Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
stenzek
GitHub Repository: stenzek/duckstation
Path: blob/master/src/core/cpu_recompiler_arm64.cpp
4802 views
1
// SPDX-FileCopyrightText: 2019-2024 Connor McLaughlin <[email protected]>
2
// SPDX-License-Identifier: CC-BY-NC-ND-4.0
3
4
#include "cpu_recompiler_arm64.h"
5
#include "cpu_core_private.h"
6
#include "cpu_pgxp.h"
7
#include "gte.h"
8
#include "settings.h"
9
#include "timing_event.h"
10
11
#include "common/align.h"
12
#include "common/assert.h"
13
#include "common/log.h"
14
#include "common/memmap.h"
15
#include "common/string_util.h"
16
17
#include <limits>
18
19
#ifdef CPU_ARCH_ARM64
20
21
#include "vixl/aarch64/constants-aarch64.h"
22
23
#ifdef ENABLE_HOST_DISASSEMBLY
24
#include "vixl/aarch64/disasm-aarch64.h"
25
#endif
26
27
LOG_CHANNEL(Recompiler);
28
29
#define PTR(x) vixl::aarch64::MemOperand(RSTATE, (((u8*)(x)) - ((u8*)&g_state)))
30
31
#define RWRET vixl::aarch64::w0
32
#define RXRET vixl::aarch64::x0
33
#define RWARG1 vixl::aarch64::w0
34
#define RXARG1 vixl::aarch64::x0
35
#define RWARG2 vixl::aarch64::w1
36
#define RXARG2 vixl::aarch64::x1
37
#define RWARG3 vixl::aarch64::w2
38
#define RXARG3 vixl::aarch64::x2
39
#define RWSCRATCH vixl::aarch64::w16
40
#define RXSCRATCH vixl::aarch64::x16
41
#define RSTATE vixl::aarch64::x19
42
#define RMEMBASE vixl::aarch64::x20
43
44
static bool armIsCallerSavedRegister(u32 id);
45
static s64 armGetPCDisplacement(const void* current, const void* target);
46
static bool armIsInAdrpRange(vixl::aarch64::Assembler* armAsm, const void* addr);
47
static void armMoveAddressToReg(vixl::aarch64::Assembler* armAsm, const vixl::aarch64::Register& reg, const void* addr);
48
static void armEmitMov(vixl::aarch64::Assembler* armAsm, const vixl::aarch64::Register& rd, u64 imm);
49
static void armEmitJmp(vixl::aarch64::Assembler* armAsm, const void* ptr, bool force_inline);
50
static void armEmitCall(vixl::aarch64::Assembler* armAsm, const void* ptr, bool force_inline);
51
static void armEmitCondBranch(vixl::aarch64::Assembler* armAsm, vixl::aarch64::Condition cond, const void* ptr);
52
static void armEmitFarLoad(vixl::aarch64::Assembler* armAsm, const vixl::aarch64::Register& reg, const void* addr,
53
bool sign_extend_word = false);
54
static void armEmitFarStore(vixl::aarch64::Assembler* armAsm, const vixl::aarch64::Register& reg, const void* addr,
55
const vixl::aarch64::Register& tempreg = RXSCRATCH);
56
static u8* armGetJumpTrampoline(const void* target);
57
static void armAlignCode(vixl::aarch64::Assembler* armAsm, size_t alignment);
58
59
static constexpr u32 TRAMPOLINE_AREA_SIZE = 4 * 1024;
60
static std::unordered_map<const void*, u32> s_trampoline_targets;
61
static u8* s_trampoline_start_ptr = nullptr;
62
static u32 s_trampoline_used = 0;
63
64
namespace CPU {
65
66
using namespace vixl::aarch64;
67
68
static ARM64Recompiler s_instance;
69
Recompiler* g_compiler = &s_instance;
70
71
} // namespace CPU
72
73
bool armIsCallerSavedRegister(u32 id)
74
{
75
// same on both linux and windows
76
return (id <= 18);
77
}
78
79
void armEmitMov(vixl::aarch64::Assembler* armAsm, const vixl::aarch64::Register& rd, u64 imm)
80
{
81
// From vixl macro assembler.
82
DebugAssert(vixl::IsUint32(imm) || vixl::IsInt32(imm) || rd.Is64Bits());
83
DebugAssert(rd.GetCode() != vixl::aarch64::sp.GetCode());
84
85
if (imm == 0)
86
{
87
armAsm->mov(rd, vixl::aarch64::Assembler::AppropriateZeroRegFor(rd));
88
return;
89
}
90
91
// The worst case for size is mov 64-bit immediate to sp:
92
// * up to 4 instructions to materialise the constant
93
// * 1 instruction to move to sp
94
95
// Immediates on Aarch64 can be produced using an initial value, and zero to
96
// three move keep operations.
97
//
98
// Initial values can be generated with:
99
// 1. 64-bit move zero (movz).
100
// 2. 32-bit move inverted (movn).
101
// 3. 64-bit move inverted.
102
// 4. 32-bit orr immediate.
103
// 5. 64-bit orr immediate.
104
// Move-keep may then be used to modify each of the 16-bit half words.
105
//
106
// The code below supports all five initial value generators, and
107
// applying move-keep operations to move-zero and move-inverted initial
108
// values.
109
110
// Try to move the immediate in one instruction, and if that fails, switch to
111
// using multiple instructions.
112
const unsigned reg_size = rd.GetSizeInBits();
113
114
if (vixl::aarch64::Assembler::IsImmMovz(imm, reg_size) && !rd.IsSP())
115
{
116
// Immediate can be represented in a move zero instruction. Movz can't write
117
// to the stack pointer.
118
armAsm->movz(rd, imm);
119
return;
120
}
121
else if (vixl::aarch64::Assembler::IsImmMovn(imm, reg_size) && !rd.IsSP())
122
{
123
// Immediate can be represented in a move negative instruction. Movn can't
124
// write to the stack pointer.
125
armAsm->movn(rd, rd.Is64Bits() ? ~imm : (~imm & vixl::aarch64::kWRegMask));
126
return;
127
}
128
else if (vixl::aarch64::Assembler::IsImmLogical(imm, reg_size))
129
{
130
// Immediate can be represented in a logical orr instruction.
131
DebugAssert(!rd.IsZero());
132
armAsm->orr(rd, vixl::aarch64::Assembler::AppropriateZeroRegFor(rd), imm);
133
return;
134
}
135
136
// Generic immediate case. Imm will be represented by
137
// [imm3, imm2, imm1, imm0], where each imm is 16 bits.
138
// A move-zero or move-inverted is generated for the first non-zero or
139
// non-0xffff immX, and a move-keep for subsequent non-zero immX.
140
141
uint64_t ignored_halfword = 0;
142
bool invert_move = false;
143
// If the number of 0xffff halfwords is greater than the number of 0x0000
144
// halfwords, it's more efficient to use move-inverted.
145
if (vixl::CountClearHalfWords(~imm, reg_size) > vixl::CountClearHalfWords(imm, reg_size))
146
{
147
ignored_halfword = 0xffff;
148
invert_move = true;
149
}
150
151
// Iterate through the halfwords. Use movn/movz for the first non-ignored
152
// halfword, and movk for subsequent halfwords.
153
DebugAssert((reg_size % 16) == 0);
154
bool first_mov_done = false;
155
for (unsigned i = 0; i < (reg_size / 16); i++)
156
{
157
uint64_t imm16 = (imm >> (16 * i)) & 0xffff;
158
if (imm16 != ignored_halfword)
159
{
160
if (!first_mov_done)
161
{
162
if (invert_move)
163
armAsm->movn(rd, ~imm16 & 0xffff, 16 * i);
164
else
165
armAsm->movz(rd, imm16, 16 * i);
166
first_mov_done = true;
167
}
168
else
169
{
170
// Construct a wider constant.
171
armAsm->movk(rd, imm16, 16 * i);
172
}
173
}
174
}
175
176
DebugAssert(first_mov_done);
177
}
178
179
s64 armGetPCDisplacement(const void* current, const void* target)
180
{
181
// pxAssert(Common::IsAlignedPow2(reinterpret_cast<size_t>(current), 4));
182
// pxAssert(Common::IsAlignedPow2(reinterpret_cast<size_t>(target), 4));
183
return static_cast<s64>((reinterpret_cast<ptrdiff_t>(target) - reinterpret_cast<ptrdiff_t>(current)) >> 2);
184
}
185
186
bool armIsInAdrpRange(vixl::aarch64::Assembler* armAsm, const void* addr)
187
{
188
const void* cur = armAsm->GetCursorAddress<const void*>();
189
const void* current_code_ptr_page =
190
reinterpret_cast<const void*>(reinterpret_cast<uintptr_t>(cur) & ~static_cast<uintptr_t>(0xFFF));
191
const void* ptr_page =
192
reinterpret_cast<const void*>(reinterpret_cast<uintptr_t>(addr) & ~static_cast<uintptr_t>(0xFFF));
193
const s64 page_displacement = armGetPCDisplacement(current_code_ptr_page, ptr_page) >> 10;
194
const u32 page_offset = static_cast<u32>(reinterpret_cast<uintptr_t>(addr) & 0xFFFu);
195
196
return (vixl::IsInt21(page_displacement) && (vixl::aarch64::Assembler::IsImmAddSub(page_offset) ||
197
vixl::aarch64::Assembler::IsImmLogical(page_offset, 64)));
198
}
199
200
void armMoveAddressToReg(vixl::aarch64::Assembler* armAsm, const vixl::aarch64::Register& reg, const void* addr)
201
{
202
DebugAssert(reg.IsX());
203
204
const void* cur = armAsm->GetCursorAddress<const void*>();
205
const void* current_code_ptr_page =
206
reinterpret_cast<const void*>(reinterpret_cast<uintptr_t>(cur) & ~static_cast<uintptr_t>(0xFFF));
207
const void* ptr_page =
208
reinterpret_cast<const void*>(reinterpret_cast<uintptr_t>(addr) & ~static_cast<uintptr_t>(0xFFF));
209
const s64 page_displacement = armGetPCDisplacement(current_code_ptr_page, ptr_page) >> 10;
210
const u32 page_offset = static_cast<u32>(reinterpret_cast<uintptr_t>(addr) & 0xFFFu);
211
if (vixl::IsInt21(page_displacement) && vixl::aarch64::Assembler::IsImmAddSub(page_offset))
212
{
213
armAsm->adrp(reg, page_displacement);
214
armAsm->add(reg, reg, page_offset);
215
}
216
else if (vixl::IsInt21(page_displacement) && vixl::aarch64::Assembler::IsImmLogical(page_offset, 64))
217
{
218
armAsm->adrp(reg, page_displacement);
219
armAsm->orr(reg, reg, page_offset);
220
}
221
else
222
{
223
armEmitMov(armAsm, reg, reinterpret_cast<uintptr_t>(addr));
224
}
225
}
226
227
void armEmitJmp(vixl::aarch64::Assembler* armAsm, const void* ptr, bool force_inline)
228
{
229
const void* cur = armAsm->GetCursorAddress<const void*>();
230
s64 displacement = armGetPCDisplacement(cur, ptr);
231
bool use_blr = !vixl::IsInt26(displacement);
232
bool use_trampoline = use_blr && !armIsInAdrpRange(armAsm, ptr);
233
if (use_blr && use_trampoline && !force_inline)
234
{
235
if (u8* trampoline = armGetJumpTrampoline(ptr); trampoline)
236
{
237
displacement = armGetPCDisplacement(cur, trampoline);
238
use_blr = !vixl::IsInt26(displacement);
239
}
240
}
241
242
if (use_blr)
243
{
244
armMoveAddressToReg(armAsm, RXSCRATCH, ptr);
245
armAsm->br(RXSCRATCH);
246
}
247
else
248
{
249
armAsm->b(displacement);
250
}
251
}
252
253
void armEmitCall(vixl::aarch64::Assembler* armAsm, const void* ptr, bool force_inline)
254
{
255
const void* cur = armAsm->GetCursorAddress<const void*>();
256
s64 displacement = armGetPCDisplacement(cur, ptr);
257
bool use_blr = !vixl::IsInt26(displacement);
258
bool use_trampoline = use_blr && !armIsInAdrpRange(armAsm, ptr);
259
if (use_blr && use_trampoline && !force_inline)
260
{
261
if (u8* trampoline = armGetJumpTrampoline(ptr); trampoline)
262
{
263
displacement = armGetPCDisplacement(cur, trampoline);
264
use_blr = !vixl::IsInt26(displacement);
265
}
266
}
267
268
if (use_blr)
269
{
270
armMoveAddressToReg(armAsm, RXSCRATCH, ptr);
271
armAsm->blr(RXSCRATCH);
272
}
273
else
274
{
275
armAsm->bl(displacement);
276
}
277
}
278
279
void armEmitCondBranch(vixl::aarch64::Assembler* armAsm, vixl::aarch64::Condition cond, const void* ptr)
280
{
281
const s64 jump_distance = static_cast<s64>(reinterpret_cast<intptr_t>(ptr) -
282
reinterpret_cast<intptr_t>(armAsm->GetCursorAddress<const void*>()));
283
// pxAssert(Common::IsAligned(jump_distance, 4));
284
285
if (vixl::aarch64::Instruction::IsValidImmPCOffset(vixl::aarch64::CondBranchType, jump_distance >> 2))
286
{
287
armAsm->b(jump_distance >> 2, cond);
288
}
289
else
290
{
291
vixl::aarch64::Label branch_not_taken;
292
armAsm->b(&branch_not_taken, InvertCondition(cond));
293
294
const s64 new_jump_distance = static_cast<s64>(reinterpret_cast<intptr_t>(ptr) -
295
reinterpret_cast<intptr_t>(armAsm->GetCursorAddress<const void*>()));
296
armAsm->b(new_jump_distance >> 2);
297
armAsm->bind(&branch_not_taken);
298
}
299
}
300
301
void armEmitFarLoad(vixl::aarch64::Assembler* armAsm, const vixl::aarch64::Register& reg, const void* addr,
302
bool sign_extend_word)
303
{
304
const void* cur = armAsm->GetCursorAddress<const void*>();
305
const void* current_code_ptr_page =
306
reinterpret_cast<const void*>(reinterpret_cast<uintptr_t>(cur) & ~static_cast<uintptr_t>(0xFFF));
307
const void* ptr_page =
308
reinterpret_cast<const void*>(reinterpret_cast<uintptr_t>(addr) & ~static_cast<uintptr_t>(0xFFF));
309
const s64 page_displacement = armGetPCDisplacement(current_code_ptr_page, ptr_page) >> 10;
310
const u32 page_offset = static_cast<u32>(reinterpret_cast<uintptr_t>(addr) & 0xFFFu);
311
vixl::aarch64::MemOperand memop;
312
313
const vixl::aarch64::Register xreg = reg.X();
314
if (vixl::IsInt21(page_displacement))
315
{
316
armAsm->adrp(xreg, page_displacement);
317
memop = vixl::aarch64::MemOperand(xreg, static_cast<int64_t>(page_offset));
318
}
319
else
320
{
321
armMoveAddressToReg(armAsm, xreg, addr);
322
memop = vixl::aarch64::MemOperand(xreg);
323
}
324
325
if (sign_extend_word)
326
armAsm->ldrsw(reg, memop);
327
else
328
armAsm->ldr(reg, memop);
329
}
330
331
[[maybe_unused]] void armEmitFarStore(vixl::aarch64::Assembler* armAsm, const vixl::aarch64::Register& reg,
332
const void* addr, const vixl::aarch64::Register& tempreg)
333
{
334
DebugAssert(tempreg.IsX());
335
336
const void* cur = armAsm->GetCursorAddress<const void*>();
337
const void* current_code_ptr_page =
338
reinterpret_cast<const void*>(reinterpret_cast<uintptr_t>(cur) & ~static_cast<uintptr_t>(0xFFF));
339
const void* ptr_page =
340
reinterpret_cast<const void*>(reinterpret_cast<uintptr_t>(addr) & ~static_cast<uintptr_t>(0xFFF));
341
const s64 page_displacement = armGetPCDisplacement(current_code_ptr_page, ptr_page) >> 10;
342
const u32 page_offset = static_cast<u32>(reinterpret_cast<uintptr_t>(addr) & 0xFFFu);
343
344
if (vixl::IsInt21(page_displacement))
345
{
346
armAsm->adrp(tempreg, page_displacement);
347
armAsm->str(reg, vixl::aarch64::MemOperand(tempreg, static_cast<int64_t>(page_offset)));
348
}
349
else
350
{
351
armMoveAddressToReg(armAsm, tempreg, addr);
352
armAsm->str(reg, vixl::aarch64::MemOperand(tempreg));
353
}
354
}
355
356
u8* armGetJumpTrampoline(const void* target)
357
{
358
auto it = s_trampoline_targets.find(target);
359
if (it != s_trampoline_targets.end())
360
return s_trampoline_start_ptr + it->second;
361
362
// align to 16 bytes?
363
const u32 offset = Common::AlignUpPow2(s_trampoline_used, CPU::Recompiler::FUNCTION_ALIGNMENT);
364
365
// 4 movs plus a jump
366
if (TRAMPOLINE_AREA_SIZE - offset < 20)
367
{
368
Panic("Ran out of space in constant pool");
369
return nullptr;
370
}
371
372
u8* start = s_trampoline_start_ptr + offset;
373
vixl::aarch64::Assembler armAsm(start, TRAMPOLINE_AREA_SIZE - offset);
374
#ifdef VIXL_DEBUG
375
vixl::CodeBufferCheckScope armAsmCheck(&armAsm, TRAMPOLINE_AREA_SIZE - offset,
376
vixl::CodeBufferCheckScope::kDontReserveBufferSpace);
377
#endif
378
armMoveAddressToReg(&armAsm, RXSCRATCH, target);
379
armAsm.br(RXSCRATCH);
380
armAsm.FinalizeCode();
381
382
const u32 size = static_cast<u32>(armAsm.GetSizeOfCodeGenerated());
383
DebugAssert(size < 20);
384
s_trampoline_targets.emplace(target, offset);
385
s_trampoline_used = offset + static_cast<u32>(size);
386
387
MemMap::FlushInstructionCache(start, size);
388
return start;
389
}
390
391
void armAlignCode(vixl::aarch64::Assembler* armAsm, size_t alignment)
392
{
393
size_t addr = armAsm->GetCursorAddress<size_t>();
394
const size_t end_addr = Common::AlignUpPow2(addr, alignment);
395
while (addr != end_addr)
396
{
397
armAsm->nop();
398
addr += vixl::aarch64::kInstructionSize;
399
}
400
}
401
402
void CPU::CodeCache::DisassembleAndLogHostCode(const void* start, u32 size)
403
{
404
#ifdef ENABLE_HOST_DISASSEMBLY
405
class MyDisassembler : public vixl::aarch64::Disassembler
406
{
407
protected:
408
void ProcessOutput(const vixl::aarch64::Instruction* instr) override
409
{
410
DEBUG_LOG("0x{:016X} {:08X}\t\t{}", reinterpret_cast<uint64_t>(instr), instr->GetInstructionBits(), GetOutput());
411
}
412
};
413
414
vixl::aarch64::Decoder decoder;
415
MyDisassembler disas;
416
decoder.AppendVisitor(&disas);
417
decoder.Decode(static_cast<const vixl::aarch64::Instruction*>(start),
418
reinterpret_cast<const vixl::aarch64::Instruction*>(static_cast<const u8*>(start) + size));
419
#else
420
ERROR_LOG("Not compiled with ENABLE_HOST_DISASSEMBLY.");
421
#endif
422
}
423
424
u32 CPU::CodeCache::GetHostInstructionCount(const void* start, u32 size)
425
{
426
return size / vixl::aarch64::kInstructionSize;
427
}
428
429
u32 CPU::CodeCache::EmitJump(void* code, const void* dst, bool flush_icache)
430
{
431
using namespace vixl::aarch64;
432
433
const s64 disp = armGetPCDisplacement(code, dst);
434
DebugAssert(vixl::IsInt26(disp));
435
436
const u32 new_code = B | Assembler::ImmUncondBranch(disp);
437
std::memcpy(code, &new_code, sizeof(new_code));
438
if (flush_icache)
439
MemMap::FlushInstructionCache(code, kInstructionSize);
440
441
return kInstructionSize;
442
}
443
444
u32 CPU::CodeCache::EmitASMFunctions(void* code, u32 code_size)
445
{
446
using namespace vixl::aarch64;
447
448
Assembler actual_asm(static_cast<u8*>(code), code_size);
449
Assembler* RESTRICT armAsm = &actual_asm;
450
451
#ifdef VIXL_DEBUG
452
vixl::CodeBufferCheckScope asm_check(armAsm, code_size, vixl::CodeBufferCheckScope::kDontReserveBufferSpace);
453
#endif
454
455
Label dispatch;
456
Label run_events_and_dispatch;
457
458
g_enter_recompiler = armAsm->GetCursorAddress<decltype(g_enter_recompiler)>();
459
{
460
// Need the CPU state for basically everything :-)
461
armMoveAddressToReg(armAsm, RSTATE, &g_state);
462
463
// Fastmem setup, oldrec doesn't need it
464
if (IsUsingFastmem())
465
armAsm->ldr(RMEMBASE, PTR(&g_state.fastmem_base));
466
467
// Fall through to event dispatcher
468
}
469
470
// check events then for frame done
471
armAlignCode(armAsm, Recompiler::FUNCTION_ALIGNMENT);
472
{
473
armAsm->ldr(RWARG1, PTR(&g_state.pending_ticks));
474
armAsm->ldr(RWARG2, PTR(&g_state.downcount));
475
armAsm->cmp(RWARG1, RWARG2);
476
armAsm->b(&dispatch, lt);
477
478
g_run_events_and_dispatch = armAsm->GetCursorAddress<const void*>();
479
armAsm->bind(&run_events_and_dispatch);
480
armEmitCall(armAsm, reinterpret_cast<const void*>(&TimingEvents::RunEvents), true);
481
}
482
483
armAlignCode(armAsm, Recompiler::FUNCTION_ALIGNMENT);
484
g_dispatcher = armAsm->GetCursorAddress<const void*>();
485
{
486
armAsm->bind(&dispatch);
487
488
// x9 <- s_fast_map[pc >> 16]
489
armAsm->ldr(RWARG1, PTR(&g_state.pc));
490
armMoveAddressToReg(armAsm, RXARG3, g_code_lut.data());
491
armAsm->lsr(RWARG2, RWARG1, 16);
492
armAsm->ubfx(RWARG1, RWARG1, 2, 14);
493
armAsm->ldr(RXARG2, MemOperand(RXARG3, RXARG2, LSL, 3));
494
495
// blr(x9[pc * 2]) (fast_map[pc >> 2])
496
armAsm->ldr(RXARG1, MemOperand(RXARG2, RXARG1, LSL, 3));
497
armAsm->br(RXARG1);
498
}
499
500
armAlignCode(armAsm, Recompiler::FUNCTION_ALIGNMENT);
501
g_compile_or_revalidate_block = armAsm->GetCursorAddress<const void*>();
502
{
503
armAsm->ldr(RWARG1, PTR(&g_state.pc));
504
armEmitCall(armAsm, reinterpret_cast<const void*>(&CompileOrRevalidateBlock), true);
505
armAsm->b(&dispatch);
506
}
507
508
armAlignCode(armAsm, Recompiler::FUNCTION_ALIGNMENT);
509
g_discard_and_recompile_block = armAsm->GetCursorAddress<const void*>();
510
{
511
armAsm->ldr(RWARG1, PTR(&g_state.pc));
512
armEmitCall(armAsm, reinterpret_cast<const void*>(&DiscardAndRecompileBlock), true);
513
armAsm->b(&dispatch);
514
}
515
516
armAlignCode(armAsm, Recompiler::FUNCTION_ALIGNMENT);
517
g_interpret_block = armAsm->GetCursorAddress<const void*>();
518
{
519
armEmitCall(armAsm, reinterpret_cast<const void*>(GetInterpretUncachedBlockFunction()), true);
520
armAsm->ldr(RWARG1, PTR(&g_state.pending_ticks));
521
armAsm->ldr(RWARG2, PTR(&g_state.downcount));
522
armAsm->cmp(RWARG1, RWARG2);
523
armAsm->b(&run_events_and_dispatch, ge);
524
armAsm->b(&dispatch);
525
}
526
527
armAsm->FinalizeCode();
528
529
s_trampoline_targets.clear();
530
s_trampoline_start_ptr = static_cast<u8*>(code) + armAsm->GetCursorOffset();
531
s_trampoline_used = 0;
532
533
return static_cast<u32>(armAsm->GetCursorOffset()) + TRAMPOLINE_AREA_SIZE;
534
}
535
536
void CPU::CodeCache::EmitAlignmentPadding(void* dst, size_t size)
537
{
538
constexpr u8 padding_value = 0x00;
539
std::memset(dst, padding_value, size);
540
}
541
542
CPU::ARM64Recompiler::ARM64Recompiler() : m_emitter(PositionDependentCode), m_far_emitter(PositionIndependentCode)
543
{
544
}
545
546
CPU::ARM64Recompiler::~ARM64Recompiler() = default;
547
548
const void* CPU::ARM64Recompiler::GetCurrentCodePointer()
549
{
550
return armAsm->GetCursorAddress<const void*>();
551
}
552
553
void CPU::ARM64Recompiler::Reset(CodeCache::Block* block, u8* code_buffer, u32 code_buffer_space, u8* far_code_buffer,
554
u32 far_code_space)
555
{
556
Recompiler::Reset(block, code_buffer, code_buffer_space, far_code_buffer, far_code_space);
557
558
// TODO: don't recreate this every time..
559
DebugAssert(!armAsm);
560
m_emitter.GetBuffer()->Reset(code_buffer, code_buffer_space);
561
m_far_emitter.GetBuffer()->Reset(far_code_buffer, far_code_space);
562
armAsm = &m_emitter;
563
564
#ifdef VIXL_DEBUG
565
m_emitter_check = std::make_unique<vixl::CodeBufferCheckScope>(&m_emitter, code_buffer_space,
566
vixl::CodeBufferCheckScope::kDontReserveBufferSpace);
567
m_far_emitter_check = std::make_unique<vixl::CodeBufferCheckScope>(
568
&m_far_emitter, far_code_space, vixl::CodeBufferCheckScope::kDontReserveBufferSpace);
569
#endif
570
571
// Need to wipe it out so it's correct when toggling fastmem.
572
m_host_regs = {};
573
574
const u32 membase_idx = CodeCache::IsUsingFastmem() ? RMEMBASE.GetCode() : NUM_HOST_REGS;
575
for (u32 i = 0; i < NUM_HOST_REGS; i++)
576
{
577
HostRegAlloc& ra = m_host_regs[i];
578
579
if (i == RWARG1.GetCode() || i == RWARG1.GetCode() || i == RWARG2.GetCode() || i == RWARG3.GetCode() ||
580
i == RWSCRATCH.GetCode() || i == RSTATE.GetCode() || i == membase_idx || i == x18.GetCode() || i >= 30)
581
{
582
continue;
583
}
584
585
ra.flags = HR_USABLE | (armIsCallerSavedRegister(i) ? 0 : HR_CALLEE_SAVED);
586
}
587
}
588
589
void CPU::ARM64Recompiler::SwitchToFarCode(bool emit_jump, vixl::aarch64::Condition cond)
590
{
591
DebugAssert(armAsm == &m_emitter);
592
if (emit_jump)
593
{
594
const s64 disp = armGetPCDisplacement(GetCurrentCodePointer(), m_far_emitter.GetCursorAddress<const void*>());
595
if (cond != Condition::al)
596
{
597
if (vixl::IsInt19(disp))
598
{
599
armAsm->b(disp, cond);
600
}
601
else
602
{
603
Label skip;
604
armAsm->b(&skip, vixl::aarch64::InvertCondition(cond));
605
armAsm->b(armGetPCDisplacement(GetCurrentCodePointer(), m_far_emitter.GetCursorAddress<const void*>()));
606
armAsm->bind(&skip);
607
}
608
}
609
else
610
{
611
armAsm->b(disp);
612
}
613
}
614
armAsm = &m_far_emitter;
615
}
616
617
void CPU::ARM64Recompiler::SwitchToFarCodeIfBitSet(const vixl::aarch64::Register& reg, u32 bit)
618
{
619
const s64 disp = armGetPCDisplacement(GetCurrentCodePointer(), m_far_emitter.GetCursorAddress<const void*>());
620
if (vixl::IsInt14(disp))
621
{
622
armAsm->tbnz(reg, bit, disp);
623
}
624
else
625
{
626
Label skip;
627
armAsm->tbz(reg, bit, &skip);
628
armAsm->b(armGetPCDisplacement(GetCurrentCodePointer(), m_far_emitter.GetCursorAddress<const void*>()));
629
armAsm->bind(&skip);
630
}
631
632
armAsm = &m_far_emitter;
633
}
634
635
void CPU::ARM64Recompiler::SwitchToFarCodeIfRegZeroOrNonZero(const vixl::aarch64::Register& reg, bool nonzero)
636
{
637
const s64 disp = armGetPCDisplacement(GetCurrentCodePointer(), m_far_emitter.GetCursorAddress<const void*>());
638
if (vixl::IsInt19(disp))
639
{
640
nonzero ? armAsm->cbnz(reg, disp) : armAsm->cbz(reg, disp);
641
}
642
else
643
{
644
Label skip;
645
nonzero ? armAsm->cbz(reg, &skip) : armAsm->cbnz(reg, &skip);
646
armAsm->b(armGetPCDisplacement(GetCurrentCodePointer(), m_far_emitter.GetCursorAddress<const void*>()));
647
armAsm->bind(&skip);
648
}
649
650
armAsm = &m_far_emitter;
651
}
652
653
void CPU::ARM64Recompiler::SwitchToNearCode(bool emit_jump, vixl::aarch64::Condition cond)
654
{
655
DebugAssert(armAsm == &m_far_emitter);
656
if (emit_jump)
657
{
658
const s64 disp = armGetPCDisplacement(GetCurrentCodePointer(), m_emitter.GetCursorAddress<const void*>());
659
(cond != Condition::al) ? armAsm->b(disp, cond) : armAsm->b(disp);
660
}
661
armAsm = &m_emitter;
662
}
663
664
void CPU::ARM64Recompiler::EmitMov(const vixl::aarch64::Register& dst, u32 val)
665
{
666
armEmitMov(armAsm, dst, val);
667
}
668
669
void CPU::ARM64Recompiler::EmitCall(const void* ptr, bool force_inline /*= false*/)
670
{
671
armEmitCall(armAsm, ptr, force_inline);
672
}
673
674
vixl::aarch64::Operand CPU::ARM64Recompiler::armCheckAddSubConstant(s32 val)
675
{
676
if (Assembler::IsImmAddSub(val))
677
return vixl::aarch64::Operand(static_cast<int64_t>(val));
678
679
EmitMov(RWSCRATCH, static_cast<u32>(val));
680
return vixl::aarch64::Operand(RWSCRATCH);
681
}
682
683
vixl::aarch64::Operand CPU::ARM64Recompiler::armCheckAddSubConstant(u32 val)
684
{
685
return armCheckAddSubConstant(static_cast<s32>(val));
686
}
687
688
vixl::aarch64::Operand CPU::ARM64Recompiler::armCheckCompareConstant(s32 val)
689
{
690
if (Assembler::IsImmConditionalCompare(val))
691
return vixl::aarch64::Operand(static_cast<int64_t>(val));
692
693
EmitMov(RWSCRATCH, static_cast<u32>(val));
694
return vixl::aarch64::Operand(RWSCRATCH);
695
}
696
697
vixl::aarch64::Operand CPU::ARM64Recompiler::armCheckLogicalConstant(u32 val)
698
{
699
if (Assembler::IsImmLogical(val, 32))
700
return vixl::aarch64::Operand(static_cast<s64>(static_cast<u64>(val)));
701
702
EmitMov(RWSCRATCH, val);
703
return vixl::aarch64::Operand(RWSCRATCH);
704
}
705
706
void CPU::ARM64Recompiler::BeginBlock()
707
{
708
Recompiler::BeginBlock();
709
}
710
711
void CPU::ARM64Recompiler::GenerateBlockProtectCheck(const u8* ram_ptr, const u8* shadow_ptr, u32 size)
712
{
713
// store it first to reduce code size, because we can offset
714
armMoveAddressToReg(armAsm, RXARG1, ram_ptr);
715
armMoveAddressToReg(armAsm, RXARG2, shadow_ptr);
716
717
bool first = true;
718
u32 offset = 0;
719
Label block_changed;
720
721
while (size >= 16)
722
{
723
const VRegister vtmp = v2.V4S();
724
const VRegister dst = first ? v0.V4S() : v1.V4S();
725
armAsm->ldr(dst, MemOperand(RXARG1, offset));
726
armAsm->ldr(vtmp, MemOperand(RXARG2, offset));
727
armAsm->cmeq(dst, dst, vtmp);
728
if (!first)
729
armAsm->and_(v0.V16B(), v0.V16B(), dst.V16B());
730
else
731
first = false;
732
733
offset += 16;
734
size -= 16;
735
}
736
737
if (!first)
738
{
739
// TODO: make sure this doesn't choke on ffffffff
740
armAsm->uminv(s0, v0.V4S());
741
armAsm->fcmp(s0, 0.0);
742
armAsm->b(&block_changed, eq);
743
}
744
745
while (size >= 8)
746
{
747
armAsm->ldr(RXARG3, MemOperand(RXARG1, offset));
748
armAsm->ldr(RXSCRATCH, MemOperand(RXARG2, offset));
749
armAsm->cmp(RXARG3, RXSCRATCH);
750
armAsm->b(&block_changed, ne);
751
offset += 8;
752
size -= 8;
753
}
754
755
while (size >= 4)
756
{
757
armAsm->ldr(RWARG3, MemOperand(RXARG1, offset));
758
armAsm->ldr(RWSCRATCH, MemOperand(RXARG2, offset));
759
armAsm->cmp(RWARG3, RWSCRATCH);
760
armAsm->b(&block_changed, ne);
761
offset += 4;
762
size -= 4;
763
}
764
765
DebugAssert(size == 0);
766
767
Label block_unchanged;
768
armAsm->b(&block_unchanged);
769
armAsm->bind(&block_changed);
770
armEmitJmp(armAsm, CodeCache::g_discard_and_recompile_block, false);
771
armAsm->bind(&block_unchanged);
772
}
773
774
void CPU::ARM64Recompiler::GenerateICacheCheckAndUpdate()
775
{
776
if (!m_block->HasFlag(CodeCache::BlockFlags::IsUsingICache))
777
{
778
if (m_block->HasFlag(CodeCache::BlockFlags::NeedsDynamicFetchTicks))
779
{
780
armEmitFarLoad(armAsm, RWARG2, GetFetchMemoryAccessTimePtr());
781
armAsm->ldr(RWARG1, PTR(&g_state.pending_ticks));
782
armEmitMov(armAsm, RWARG3, m_block->size);
783
armAsm->mul(RWARG2, RWARG2, RWARG3);
784
armAsm->add(RWARG1, RWARG1, RWARG2);
785
armAsm->str(RWARG1, PTR(&g_state.pending_ticks));
786
}
787
else
788
{
789
armAsm->ldr(RWARG1, PTR(&g_state.pending_ticks));
790
armAsm->add(RWARG1, RWARG1, armCheckAddSubConstant(static_cast<u32>(m_block->uncached_fetch_ticks)));
791
armAsm->str(RWARG1, PTR(&g_state.pending_ticks));
792
}
793
}
794
else if (m_block->icache_line_count > 0)
795
{
796
const auto& ticks_reg = RWARG1;
797
const auto& current_tag_reg = RWARG2;
798
const auto& existing_tag_reg = RWARG3;
799
const auto& fill_ticks_reg = w4;
800
const auto& ticks_to_add_reg = w5;
801
802
VirtualMemoryAddress current_pc = m_block->pc & ICACHE_TAG_ADDRESS_MASK;
803
const TickCount fill_ticks = GetICacheFillTicks(current_pc);
804
if (fill_ticks <= 0)
805
return;
806
807
armAsm->ldr(ticks_reg, PTR(&g_state.pending_ticks));
808
armEmitMov(armAsm, current_tag_reg, current_pc);
809
armEmitMov(armAsm, fill_ticks_reg, fill_ticks);
810
811
for (u32 i = 0; i < m_block->icache_line_count; i++, current_pc += ICACHE_LINE_SIZE)
812
{
813
const u32 line = GetICacheLine(current_pc);
814
const u32 offset = OFFSETOF(State, icache_tags) + (line * sizeof(u32));
815
816
Label cache_hit;
817
armAsm->ldr(existing_tag_reg, MemOperand(RSTATE, offset));
818
armAsm->str(current_tag_reg, MemOperand(RSTATE, offset));
819
armAsm->cmp(existing_tag_reg, current_tag_reg);
820
armAsm->csel(ticks_to_add_reg, fill_ticks_reg, wzr, ne);
821
armAsm->add(ticks_reg, ticks_reg, ticks_to_add_reg);
822
823
if (i != (m_block->icache_line_count - 1))
824
armAsm->add(current_tag_reg, current_tag_reg, armCheckAddSubConstant(ICACHE_LINE_SIZE));
825
}
826
827
armAsm->str(ticks_reg, PTR(&g_state.pending_ticks));
828
}
829
}
830
831
void CPU::ARM64Recompiler::GenerateCall(const void* func, s32 arg1reg /*= -1*/, s32 arg2reg /*= -1*/,
832
s32 arg3reg /*= -1*/)
833
{
834
if (arg1reg >= 0 && arg1reg != static_cast<s32>(RXARG1.GetCode()))
835
armAsm->mov(RXARG1, XRegister(arg1reg));
836
if (arg2reg >= 0 && arg2reg != static_cast<s32>(RXARG2.GetCode()))
837
armAsm->mov(RXARG2, XRegister(arg2reg));
838
if (arg3reg >= 0 && arg3reg != static_cast<s32>(RXARG3.GetCode()))
839
armAsm->mov(RXARG3, XRegister(arg3reg));
840
EmitCall(func);
841
}
842
843
void CPU::ARM64Recompiler::EndBlock(const std::optional<u32>& newpc, bool do_event_test)
844
{
845
if (newpc.has_value())
846
{
847
if (m_dirty_pc || m_compiler_pc != newpc)
848
{
849
EmitMov(RWSCRATCH, newpc.value());
850
armAsm->str(RWSCRATCH, PTR(&g_state.pc));
851
}
852
}
853
m_dirty_pc = false;
854
855
// flush regs
856
Flush(FLUSH_END_BLOCK);
857
EndAndLinkBlock(newpc, do_event_test, false);
858
}
859
860
void CPU::ARM64Recompiler::EndBlockWithException(Exception excode)
861
{
862
// flush regs, but not pc, it's going to get overwritten
863
// flush cycles because of the GTE instruction stuff...
864
Flush(FLUSH_END_BLOCK | FLUSH_FOR_EXCEPTION | FLUSH_FOR_C_CALL);
865
866
// TODO: flush load delay
867
868
EmitMov(RWARG1, Cop0Registers::CAUSE::MakeValueForException(excode, m_current_instruction_branch_delay_slot, false,
869
inst->cop.cop_n));
870
EmitMov(RWARG2, m_current_instruction_pc);
871
if (excode != Exception::BP)
872
{
873
EmitCall(reinterpret_cast<const void*>(static_cast<void (*)(u32, u32)>(&CPU::RaiseException)));
874
}
875
else
876
{
877
EmitMov(RWARG3, inst->bits);
878
EmitCall(reinterpret_cast<const void*>(&CPU::RaiseBreakException));
879
}
880
m_dirty_pc = false;
881
882
EndAndLinkBlock(std::nullopt, true, false);
883
}
884
885
void CPU::ARM64Recompiler::EndAndLinkBlock(const std::optional<u32>& newpc, bool do_event_test, bool force_run_events)
886
{
887
// event test
888
// pc should've been flushed
889
DebugAssert(!m_dirty_pc && !m_block_ended);
890
m_block_ended = true;
891
892
// TODO: try extracting this to a function
893
894
// save cycles for event test
895
const TickCount cycles = std::exchange(m_cycles, 0);
896
897
// pending_ticks += cycles
898
// if (pending_ticks >= downcount) { dispatch_event(); }
899
if (do_event_test || m_gte_done_cycle > cycles || cycles > 0)
900
armAsm->ldr(RWARG1, PTR(&g_state.pending_ticks));
901
if (do_event_test)
902
armAsm->ldr(RWARG2, PTR(&g_state.downcount));
903
if (cycles > 0)
904
armAsm->add(RWARG1, RWARG1, armCheckAddSubConstant(cycles));
905
if (m_gte_done_cycle > cycles)
906
{
907
armAsm->add(RWARG2, RWARG1, armCheckAddSubConstant(m_gte_done_cycle - cycles));
908
armAsm->str(RWARG2, PTR(&g_state.gte_completion_tick));
909
}
910
if (do_event_test)
911
armAsm->cmp(RWARG1, RWARG2);
912
if (cycles > 0)
913
armAsm->str(RWARG1, PTR(&g_state.pending_ticks));
914
if (do_event_test)
915
armEmitCondBranch(armAsm, ge, CodeCache::g_run_events_and_dispatch);
916
917
// jump to dispatcher or next block
918
if (force_run_events)
919
{
920
armEmitJmp(armAsm, CodeCache::g_run_events_and_dispatch, false);
921
}
922
else if (!newpc.has_value())
923
{
924
armEmitJmp(armAsm, CodeCache::g_dispatcher, false);
925
}
926
else
927
{
928
const void* target = (newpc.value() == m_block->pc) ?
929
CodeCache::CreateSelfBlockLink(m_block, armAsm->GetCursorAddress<void*>(),
930
armAsm->GetBuffer()->GetStartAddress<const void*>()) :
931
CodeCache::CreateBlockLink(m_block, armAsm->GetCursorAddress<void*>(), newpc.value());
932
armEmitJmp(armAsm, target, true);
933
}
934
}
935
936
const void* CPU::ARM64Recompiler::EndCompile(u32* code_size, u32* far_code_size)
937
{
938
#ifdef VIXL_DEBUG
939
m_emitter_check.reset();
940
m_far_emitter_check.reset();
941
#endif
942
943
m_emitter.FinalizeCode();
944
m_far_emitter.FinalizeCode();
945
946
u8* const code = m_emitter.GetBuffer()->GetStartAddress<u8*>();
947
*code_size = static_cast<u32>(m_emitter.GetCursorOffset());
948
*far_code_size = static_cast<u32>(m_far_emitter.GetCursorOffset());
949
armAsm = nullptr;
950
return code;
951
}
952
953
const char* CPU::ARM64Recompiler::GetHostRegName(u32 reg) const
954
{
955
static constexpr std::array<const char*, 32> reg64_names = {
956
{"x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15",
957
"x16", "x17", "x18", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "fp", "lr", "sp"}};
958
return (reg < reg64_names.size()) ? reg64_names[reg] : "UNKNOWN";
959
}
960
961
void CPU::ARM64Recompiler::LoadHostRegWithConstant(u32 reg, u32 val)
962
{
963
EmitMov(WRegister(reg), val);
964
}
965
966
void CPU::ARM64Recompiler::LoadHostRegFromCPUPointer(u32 reg, const void* ptr)
967
{
968
armAsm->ldr(WRegister(reg), PTR(ptr));
969
}
970
971
void CPU::ARM64Recompiler::StoreHostRegToCPUPointer(u32 reg, const void* ptr)
972
{
973
armAsm->str(WRegister(reg), PTR(ptr));
974
}
975
976
void CPU::ARM64Recompiler::StoreConstantToCPUPointer(u32 val, const void* ptr)
977
{
978
if (val == 0)
979
{
980
armAsm->str(wzr, PTR(ptr));
981
return;
982
}
983
984
EmitMov(RWSCRATCH, val);
985
armAsm->str(RWSCRATCH, PTR(ptr));
986
}
987
988
void CPU::ARM64Recompiler::CopyHostReg(u32 dst, u32 src)
989
{
990
if (src != dst)
991
armAsm->mov(WRegister(dst), WRegister(src));
992
}
993
994
void CPU::ARM64Recompiler::AssertRegOrConstS(CompileFlags cf) const
995
{
996
DebugAssert(cf.valid_host_s || cf.const_s);
997
}
998
999
void CPU::ARM64Recompiler::AssertRegOrConstT(CompileFlags cf) const
1000
{
1001
DebugAssert(cf.valid_host_t || cf.const_t);
1002
}
1003
1004
vixl::aarch64::MemOperand CPU::ARM64Recompiler::MipsPtr(Reg r) const
1005
{
1006
DebugAssert(r < Reg::count);
1007
return PTR(&g_state.regs.r[static_cast<u32>(r)]);
1008
}
1009
1010
vixl::aarch64::Register CPU::ARM64Recompiler::CFGetRegD(CompileFlags cf) const
1011
{
1012
DebugAssert(cf.valid_host_d);
1013
return WRegister(cf.host_d);
1014
}
1015
1016
vixl::aarch64::Register CPU::ARM64Recompiler::CFGetRegS(CompileFlags cf) const
1017
{
1018
DebugAssert(cf.valid_host_s);
1019
return WRegister(cf.host_s);
1020
}
1021
1022
vixl::aarch64::Register CPU::ARM64Recompiler::CFGetRegT(CompileFlags cf) const
1023
{
1024
DebugAssert(cf.valid_host_t);
1025
return WRegister(cf.host_t);
1026
}
1027
1028
vixl::aarch64::Register CPU::ARM64Recompiler::CFGetRegLO(CompileFlags cf) const
1029
{
1030
DebugAssert(cf.valid_host_lo);
1031
return WRegister(cf.host_lo);
1032
}
1033
1034
vixl::aarch64::Register CPU::ARM64Recompiler::CFGetRegHI(CompileFlags cf) const
1035
{
1036
DebugAssert(cf.valid_host_hi);
1037
return WRegister(cf.host_hi);
1038
}
1039
1040
void CPU::ARM64Recompiler::MoveSToReg(const vixl::aarch64::Register& dst, CompileFlags cf)
1041
{
1042
DebugAssert(dst.IsW());
1043
if (cf.valid_host_s)
1044
{
1045
if (cf.host_s != dst.GetCode())
1046
armAsm->mov(dst, WRegister(cf.host_s));
1047
}
1048
else if (cf.const_s)
1049
{
1050
const u32 cv = GetConstantRegU32(cf.MipsS());
1051
if (cv == 0)
1052
armAsm->mov(dst, wzr);
1053
else
1054
EmitMov(dst, cv);
1055
}
1056
else
1057
{
1058
WARNING_LOG("Hit memory path in MoveSToReg() for {}", GetRegName(cf.MipsS()));
1059
armAsm->ldr(dst, PTR(&g_state.regs.r[cf.mips_s]));
1060
}
1061
}
1062
1063
void CPU::ARM64Recompiler::MoveTToReg(const vixl::aarch64::Register& dst, CompileFlags cf)
1064
{
1065
DebugAssert(dst.IsW());
1066
if (cf.valid_host_t)
1067
{
1068
if (cf.host_t != dst.GetCode())
1069
armAsm->mov(dst, WRegister(cf.host_t));
1070
}
1071
else if (cf.const_t)
1072
{
1073
const u32 cv = GetConstantRegU32(cf.MipsT());
1074
if (cv == 0)
1075
armAsm->mov(dst, wzr);
1076
else
1077
EmitMov(dst, cv);
1078
}
1079
else
1080
{
1081
WARNING_LOG("Hit memory path in MoveTToReg() for {}", GetRegName(cf.MipsT()));
1082
armAsm->ldr(dst, PTR(&g_state.regs.r[cf.mips_t]));
1083
}
1084
}
1085
1086
void CPU::ARM64Recompiler::MoveMIPSRegToReg(const vixl::aarch64::Register& dst, Reg reg, bool ignore_load_delays)
1087
{
1088
DebugAssert(reg < Reg::count && dst.IsW());
1089
if (ignore_load_delays && m_load_delay_register == reg)
1090
{
1091
if (m_load_delay_value_register == NUM_HOST_REGS)
1092
armAsm->ldr(dst, PTR(&g_state.load_delay_value));
1093
else
1094
armAsm->mov(dst, WRegister(m_load_delay_value_register));
1095
}
1096
else if (const std::optional<u32> hreg = CheckHostReg(0, Recompiler::HR_TYPE_CPU_REG, reg))
1097
{
1098
armAsm->mov(dst, WRegister(hreg.value()));
1099
}
1100
else if (HasConstantReg(reg))
1101
{
1102
EmitMov(dst, GetConstantRegU32(reg));
1103
}
1104
else
1105
{
1106
armAsm->ldr(dst, MipsPtr(reg));
1107
}
1108
}
1109
1110
void CPU::ARM64Recompiler::GeneratePGXPCallWithMIPSRegs(const void* func, u32 arg1val, Reg arg2reg /* = Reg::count */,
1111
Reg arg3reg /* = Reg::count */)
1112
{
1113
DebugAssert(g_settings.gpu_pgxp_enable);
1114
1115
Flush(FLUSH_FOR_C_CALL);
1116
1117
if (arg2reg != Reg::count)
1118
MoveMIPSRegToReg(RWARG2, arg2reg);
1119
if (arg3reg != Reg::count)
1120
MoveMIPSRegToReg(RWARG3, arg3reg);
1121
1122
EmitMov(RWARG1, arg1val);
1123
EmitCall(func);
1124
}
1125
1126
void CPU::ARM64Recompiler::Flush(u32 flags)
1127
{
1128
Recompiler::Flush(flags);
1129
1130
if (flags & FLUSH_PC && m_dirty_pc)
1131
{
1132
StoreConstantToCPUPointer(m_compiler_pc, &g_state.pc);
1133
m_dirty_pc = false;
1134
}
1135
1136
if (flags & FLUSH_INSTRUCTION_BITS)
1137
{
1138
// This sucks, but it's only used for fallbacks.
1139
EmitMov(RWARG1, inst->bits);
1140
EmitMov(RWARG2, m_current_instruction_pc);
1141
EmitMov(RWARG3, m_current_instruction_branch_delay_slot);
1142
armAsm->str(RWARG1, PTR(&g_state.current_instruction.bits));
1143
armAsm->str(RWARG2, PTR(&g_state.current_instruction_pc));
1144
armAsm->strb(RWARG3, PTR(&g_state.current_instruction_in_branch_delay_slot));
1145
}
1146
1147
if (flags & FLUSH_LOAD_DELAY_FROM_STATE && m_load_delay_dirty)
1148
{
1149
// This sucks :(
1150
// TODO: make it a function?
1151
armAsm->ldrb(RWARG1, PTR(&g_state.load_delay_reg));
1152
armAsm->ldr(RWARG2, PTR(&g_state.load_delay_value));
1153
EmitMov(RWSCRATCH, OFFSETOF(CPU::State, regs.r[0]));
1154
armAsm->add(RWARG1, RWSCRATCH, vixl::aarch64::Operand(RWARG1, LSL, 2));
1155
armAsm->str(RWARG2, MemOperand(RSTATE, RXARG1));
1156
EmitMov(RWSCRATCH, static_cast<u8>(Reg::count));
1157
armAsm->strb(RWSCRATCH, PTR(&g_state.load_delay_reg));
1158
m_load_delay_dirty = false;
1159
}
1160
1161
if (flags & FLUSH_LOAD_DELAY && m_load_delay_register != Reg::count)
1162
{
1163
if (m_load_delay_value_register != NUM_HOST_REGS)
1164
FreeHostReg(m_load_delay_value_register);
1165
1166
EmitMov(RWSCRATCH, static_cast<u8>(m_load_delay_register));
1167
armAsm->strb(RWSCRATCH, PTR(&g_state.load_delay_reg));
1168
m_load_delay_register = Reg::count;
1169
m_load_delay_dirty = true;
1170
}
1171
1172
if (flags & FLUSH_GTE_STALL_FROM_STATE && m_dirty_gte_done_cycle)
1173
{
1174
// May as well flush cycles while we're here.
1175
// GTE spanning blocks is very rare, we _could_ disable this for speed.
1176
armAsm->ldr(RWARG1, PTR(&g_state.pending_ticks));
1177
armAsm->ldr(RWARG2, PTR(&g_state.gte_completion_tick));
1178
if (m_cycles > 0)
1179
{
1180
armAsm->add(RWARG1, RWARG1, armCheckAddSubConstant(m_cycles));
1181
m_cycles = 0;
1182
}
1183
armAsm->cmp(RWARG2, RWARG1);
1184
armAsm->csel(RWARG1, RWARG2, RWARG1, hs);
1185
armAsm->str(RWARG1, PTR(&g_state.pending_ticks));
1186
m_dirty_gte_done_cycle = false;
1187
}
1188
1189
if (flags & FLUSH_GTE_DONE_CYCLE && m_gte_done_cycle > m_cycles)
1190
{
1191
armAsm->ldr(RWARG1, PTR(&g_state.pending_ticks));
1192
1193
// update cycles at the same time
1194
if (flags & FLUSH_CYCLES && m_cycles > 0)
1195
{
1196
armAsm->add(RWARG1, RWARG1, armCheckAddSubConstant(m_cycles));
1197
armAsm->str(RWARG1, PTR(&g_state.pending_ticks));
1198
m_gte_done_cycle -= m_cycles;
1199
m_cycles = 0;
1200
}
1201
1202
armAsm->add(RWARG1, RWARG1, armCheckAddSubConstant(m_gte_done_cycle));
1203
armAsm->str(RWARG1, PTR(&g_state.gte_completion_tick));
1204
m_gte_done_cycle = 0;
1205
m_dirty_gte_done_cycle = true;
1206
}
1207
1208
if (flags & FLUSH_CYCLES && m_cycles > 0)
1209
{
1210
armAsm->ldr(RWARG1, PTR(&g_state.pending_ticks));
1211
armAsm->add(RWARG1, RWARG1, armCheckAddSubConstant(m_cycles));
1212
armAsm->str(RWARG1, PTR(&g_state.pending_ticks));
1213
m_gte_done_cycle = std::max<TickCount>(m_gte_done_cycle - m_cycles, 0);
1214
m_cycles = 0;
1215
}
1216
}
1217
1218
void CPU::ARM64Recompiler::Compile_Fallback()
1219
{
1220
WARNING_LOG("Compiling instruction fallback at PC=0x{:08X}, instruction=0x{:08X}", m_current_instruction_pc,
1221
inst->bits);
1222
1223
Flush(FLUSH_FOR_INTERPRETER);
1224
1225
EmitCall(reinterpret_cast<const void*>(&CPU::RecompilerThunks::InterpretInstruction));
1226
1227
// TODO: make me less garbage
1228
// TODO: this is wrong, it flushes the load delay on the same cycle when we return.
1229
// but nothing should be going through here..
1230
Label no_load_delay;
1231
armAsm->ldrb(RWARG1, PTR(&g_state.next_load_delay_reg));
1232
armAsm->cmp(RWARG1, static_cast<u8>(Reg::count));
1233
armAsm->b(&no_load_delay, eq);
1234
armAsm->ldr(RWARG2, PTR(&g_state.next_load_delay_value));
1235
armAsm->strb(RWARG1, PTR(&g_state.load_delay_reg));
1236
armAsm->str(RWARG2, PTR(&g_state.load_delay_value));
1237
EmitMov(RWARG1, static_cast<u32>(Reg::count));
1238
armAsm->strb(RWARG1, PTR(&g_state.next_load_delay_reg));
1239
armAsm->bind(&no_load_delay);
1240
1241
m_load_delay_dirty = EMULATE_LOAD_DELAYS;
1242
}
1243
1244
void CPU::ARM64Recompiler::CheckBranchTarget(const vixl::aarch64::Register& pcreg)
1245
{
1246
DebugAssert(pcreg.IsW());
1247
if (!g_settings.cpu_recompiler_memory_exceptions)
1248
return;
1249
1250
armAsm->tst(pcreg, armCheckLogicalConstant(0x3));
1251
SwitchToFarCode(true, ne);
1252
1253
BackupHostState();
1254
EndBlockWithException(Exception::AdEL);
1255
1256
RestoreHostState();
1257
SwitchToNearCode(false);
1258
}
1259
1260
void CPU::ARM64Recompiler::Compile_jr(CompileFlags cf)
1261
{
1262
const Register pcreg = CFGetRegS(cf);
1263
CheckBranchTarget(pcreg);
1264
1265
armAsm->str(pcreg, PTR(&g_state.pc));
1266
1267
CompileBranchDelaySlot(false);
1268
EndBlock(std::nullopt, true);
1269
}
1270
1271
void CPU::ARM64Recompiler::Compile_jalr(CompileFlags cf)
1272
{
1273
const Register pcreg = CFGetRegS(cf);
1274
if (MipsD() != Reg::zero)
1275
SetConstantReg(MipsD(), GetBranchReturnAddress(cf));
1276
1277
CheckBranchTarget(pcreg);
1278
armAsm->str(pcreg, PTR(&g_state.pc));
1279
1280
CompileBranchDelaySlot(false);
1281
EndBlock(std::nullopt, true);
1282
}
1283
1284
void CPU::ARM64Recompiler::Compile_bxx(CompileFlags cf, BranchCondition cond)
1285
{
1286
AssertRegOrConstS(cf);
1287
1288
const u32 taken_pc = GetConditionalBranchTarget(cf);
1289
1290
Flush(FLUSH_FOR_BRANCH);
1291
1292
DebugAssert(cf.valid_host_s);
1293
1294
// MipsT() here should equal zero for zero branches.
1295
DebugAssert(cond == BranchCondition::Equal || cond == BranchCondition::NotEqual || cf.MipsT() == Reg::zero);
1296
1297
Label taken;
1298
const Register rs = CFGetRegS(cf);
1299
switch (cond)
1300
{
1301
case BranchCondition::Equal:
1302
case BranchCondition::NotEqual:
1303
{
1304
AssertRegOrConstT(cf);
1305
if (cf.const_t && HasConstantRegValue(cf.MipsT(), 0))
1306
{
1307
(cond == BranchCondition::Equal) ? armAsm->cbz(rs, &taken) : armAsm->cbnz(rs, &taken);
1308
}
1309
else
1310
{
1311
if (cf.valid_host_t)
1312
armAsm->cmp(rs, CFGetRegT(cf));
1313
else if (cf.const_t)
1314
armAsm->cmp(rs, armCheckCompareConstant(GetConstantRegU32(cf.MipsT())));
1315
1316
armAsm->b(&taken, (cond == BranchCondition::Equal) ? eq : ne);
1317
}
1318
}
1319
break;
1320
1321
case BranchCondition::GreaterThanZero:
1322
{
1323
armAsm->cmp(rs, 0);
1324
armAsm->b(&taken, gt);
1325
}
1326
break;
1327
1328
case BranchCondition::GreaterEqualZero:
1329
{
1330
armAsm->cmp(rs, 0);
1331
armAsm->b(&taken, ge);
1332
}
1333
break;
1334
1335
case BranchCondition::LessThanZero:
1336
{
1337
armAsm->cmp(rs, 0);
1338
armAsm->b(&taken, lt);
1339
}
1340
break;
1341
1342
case BranchCondition::LessEqualZero:
1343
{
1344
armAsm->cmp(rs, 0);
1345
armAsm->b(&taken, le);
1346
}
1347
break;
1348
}
1349
1350
BackupHostState();
1351
if (!cf.delay_slot_swapped)
1352
CompileBranchDelaySlot();
1353
1354
EndBlock(m_compiler_pc, true);
1355
1356
armAsm->bind(&taken);
1357
1358
RestoreHostState();
1359
if (!cf.delay_slot_swapped)
1360
CompileBranchDelaySlot();
1361
1362
EndBlock(taken_pc, true);
1363
}
1364
1365
void CPU::ARM64Recompiler::Compile_addi(CompileFlags cf, bool overflow)
1366
{
1367
const Register rs = CFGetRegS(cf);
1368
const Register rt = CFGetRegT(cf);
1369
if (const u32 imm = inst->i.imm_sext32(); imm != 0)
1370
{
1371
if (!overflow)
1372
{
1373
armAsm->add(rt, rs, armCheckAddSubConstant(imm));
1374
}
1375
else
1376
{
1377
armAsm->adds(rt, rs, armCheckAddSubConstant(imm));
1378
TestOverflow(rt);
1379
}
1380
}
1381
else if (rt.GetCode() != rs.GetCode())
1382
{
1383
armAsm->mov(rt, rs);
1384
}
1385
}
1386
1387
void CPU::ARM64Recompiler::Compile_addi(CompileFlags cf)
1388
{
1389
Compile_addi(cf, g_settings.cpu_recompiler_memory_exceptions);
1390
}
1391
1392
void CPU::ARM64Recompiler::Compile_addiu(CompileFlags cf)
1393
{
1394
Compile_addi(cf, false);
1395
}
1396
1397
void CPU::ARM64Recompiler::Compile_slti(CompileFlags cf)
1398
{
1399
Compile_slti(cf, true);
1400
}
1401
1402
void CPU::ARM64Recompiler::Compile_sltiu(CompileFlags cf)
1403
{
1404
Compile_slti(cf, false);
1405
}
1406
1407
void CPU::ARM64Recompiler::Compile_slti(CompileFlags cf, bool sign)
1408
{
1409
armAsm->cmp(CFGetRegS(cf), armCheckCompareConstant(static_cast<s32>(inst->i.imm_sext32())));
1410
armAsm->cset(CFGetRegT(cf), sign ? lt : lo);
1411
}
1412
1413
void CPU::ARM64Recompiler::Compile_andi(CompileFlags cf)
1414
{
1415
const Register rt = CFGetRegT(cf);
1416
if (const u32 imm = inst->i.imm_zext32(); imm != 0)
1417
armAsm->and_(rt, CFGetRegS(cf), armCheckLogicalConstant(imm));
1418
else
1419
armAsm->mov(rt, wzr);
1420
}
1421
1422
void CPU::ARM64Recompiler::Compile_ori(CompileFlags cf)
1423
{
1424
const Register rt = CFGetRegT(cf);
1425
const Register rs = CFGetRegS(cf);
1426
if (const u32 imm = inst->i.imm_zext32(); imm != 0)
1427
armAsm->orr(rt, rs, armCheckLogicalConstant(imm));
1428
else if (rt.GetCode() != rs.GetCode())
1429
armAsm->mov(rt, rs);
1430
}
1431
1432
void CPU::ARM64Recompiler::Compile_xori(CompileFlags cf)
1433
{
1434
const Register rt = CFGetRegT(cf);
1435
const Register rs = CFGetRegS(cf);
1436
if (const u32 imm = inst->i.imm_zext32(); imm != 0)
1437
armAsm->eor(rt, rs, armCheckLogicalConstant(imm));
1438
else if (rt.GetCode() != rs.GetCode())
1439
armAsm->mov(rt, rs);
1440
}
1441
1442
void CPU::ARM64Recompiler::Compile_shift(CompileFlags cf,
1443
void (vixl::aarch64::Assembler::*op)(const vixl::aarch64::Register&,
1444
const vixl::aarch64::Register&, unsigned))
1445
{
1446
const Register rd = CFGetRegD(cf);
1447
const Register rt = CFGetRegT(cf);
1448
if (inst->r.shamt > 0)
1449
(armAsm->*op)(rd, rt, inst->r.shamt);
1450
else if (rd.GetCode() != rt.GetCode())
1451
armAsm->mov(rd, rt);
1452
}
1453
1454
void CPU::ARM64Recompiler::Compile_sll(CompileFlags cf)
1455
{
1456
Compile_shift(cf, &Assembler::lsl);
1457
}
1458
1459
void CPU::ARM64Recompiler::Compile_srl(CompileFlags cf)
1460
{
1461
Compile_shift(cf, &Assembler::lsr);
1462
}
1463
1464
void CPU::ARM64Recompiler::Compile_sra(CompileFlags cf)
1465
{
1466
Compile_shift(cf, &Assembler::asr);
1467
}
1468
1469
void CPU::ARM64Recompiler::Compile_variable_shift(
1470
CompileFlags cf,
1471
void (vixl::aarch64::Assembler::*op)(const vixl::aarch64::Register&, const vixl::aarch64::Register&,
1472
const vixl::aarch64::Register&),
1473
void (vixl::aarch64::Assembler::*op_const)(const vixl::aarch64::Register&, const vixl::aarch64::Register&, unsigned))
1474
{
1475
const Register rd = CFGetRegD(cf);
1476
1477
AssertRegOrConstS(cf);
1478
AssertRegOrConstT(cf);
1479
1480
const Register rt = cf.valid_host_t ? CFGetRegT(cf) : RWARG2;
1481
if (!cf.valid_host_t)
1482
MoveTToReg(rt, cf);
1483
1484
if (cf.const_s)
1485
{
1486
if (const u32 shift = GetConstantRegU32(cf.MipsS()); shift != 0)
1487
(armAsm->*op_const)(rd, rt, shift);
1488
else if (rd.GetCode() != rt.GetCode())
1489
armAsm->mov(rd, rt);
1490
}
1491
else
1492
{
1493
(armAsm->*op)(rd, rt, CFGetRegS(cf));
1494
}
1495
}
1496
1497
void CPU::ARM64Recompiler::Compile_sllv(CompileFlags cf)
1498
{
1499
Compile_variable_shift(cf, &Assembler::lslv, &Assembler::lsl);
1500
}
1501
1502
void CPU::ARM64Recompiler::Compile_srlv(CompileFlags cf)
1503
{
1504
Compile_variable_shift(cf, &Assembler::lsrv, &Assembler::lsr);
1505
}
1506
1507
void CPU::ARM64Recompiler::Compile_srav(CompileFlags cf)
1508
{
1509
Compile_variable_shift(cf, &Assembler::asrv, &Assembler::asr);
1510
}
1511
1512
void CPU::ARM64Recompiler::Compile_mult(CompileFlags cf, bool sign)
1513
{
1514
const Register rs = cf.valid_host_s ? CFGetRegS(cf) : RWARG1;
1515
if (!cf.valid_host_s)
1516
MoveSToReg(rs, cf);
1517
1518
const Register rt = cf.valid_host_t ? CFGetRegT(cf) : RWARG2;
1519
if (!cf.valid_host_t)
1520
MoveTToReg(rt, cf);
1521
1522
// TODO: if lo/hi gets killed, we can use a 32-bit multiply
1523
const Register lo = CFGetRegLO(cf);
1524
const Register hi = CFGetRegHI(cf);
1525
1526
(sign) ? armAsm->smull(lo.X(), rs, rt) : armAsm->umull(lo.X(), rs, rt);
1527
armAsm->lsr(hi.X(), lo.X(), 32);
1528
}
1529
1530
void CPU::ARM64Recompiler::Compile_mult(CompileFlags cf)
1531
{
1532
Compile_mult(cf, true);
1533
}
1534
1535
void CPU::ARM64Recompiler::Compile_multu(CompileFlags cf)
1536
{
1537
Compile_mult(cf, false);
1538
}
1539
1540
void CPU::ARM64Recompiler::Compile_div(CompileFlags cf)
1541
{
1542
const Register rs = cf.valid_host_s ? CFGetRegS(cf) : RWARG1;
1543
if (!cf.valid_host_s)
1544
MoveSToReg(rs, cf);
1545
1546
const Register rt = cf.valid_host_t ? CFGetRegT(cf) : RWARG2;
1547
if (!cf.valid_host_t)
1548
MoveTToReg(rt, cf);
1549
1550
const Register rlo = CFGetRegLO(cf);
1551
const Register rhi = CFGetRegHI(cf);
1552
1553
// TODO: This could be slightly more optimal
1554
Label done;
1555
Label not_divide_by_zero;
1556
armAsm->cbnz(rt, &not_divide_by_zero);
1557
armAsm->mov(rhi, rs); // hi = num
1558
EmitMov(rlo, 1);
1559
EmitMov(RWSCRATCH, static_cast<u32>(-1));
1560
armAsm->cmp(rs, 0);
1561
armAsm->csel(rlo, RWSCRATCH, rlo, ge); // lo = s >= 0 ? -1 : 1
1562
armAsm->b(&done);
1563
1564
armAsm->bind(&not_divide_by_zero);
1565
Label not_unrepresentable;
1566
armAsm->cmp(rs, armCheckCompareConstant(static_cast<s32>(0x80000000u)));
1567
armAsm->b(&not_unrepresentable, ne);
1568
armAsm->cmp(rt, armCheckCompareConstant(-1));
1569
armAsm->b(&not_unrepresentable, ne);
1570
1571
EmitMov(rlo, 0x80000000u);
1572
EmitMov(rhi, 0);
1573
armAsm->b(&done);
1574
1575
armAsm->bind(&not_unrepresentable);
1576
1577
armAsm->sdiv(rlo, rs, rt);
1578
1579
// TODO: skip when hi is dead
1580
armAsm->msub(rhi, rlo, rt, rs);
1581
1582
armAsm->bind(&done);
1583
}
1584
1585
void CPU::ARM64Recompiler::Compile_divu(CompileFlags cf)
1586
{
1587
const Register rs = cf.valid_host_s ? CFGetRegS(cf) : RWARG1;
1588
if (!cf.valid_host_s)
1589
MoveSToReg(rs, cf);
1590
1591
const Register rt = cf.valid_host_t ? CFGetRegT(cf) : RWARG2;
1592
if (!cf.valid_host_t)
1593
MoveTToReg(rt, cf);
1594
1595
const Register rlo = CFGetRegLO(cf);
1596
const Register rhi = CFGetRegHI(cf);
1597
1598
Label done;
1599
Label not_divide_by_zero;
1600
armAsm->cbnz(rt, &not_divide_by_zero);
1601
EmitMov(rlo, static_cast<u32>(-1));
1602
armAsm->mov(rhi, rs);
1603
armAsm->b(&done);
1604
1605
armAsm->bind(&not_divide_by_zero);
1606
1607
armAsm->udiv(rlo, rs, rt);
1608
1609
// TODO: skip when hi is dead
1610
armAsm->msub(rhi, rlo, rt, rs);
1611
1612
armAsm->bind(&done);
1613
}
1614
1615
void CPU::ARM64Recompiler::TestOverflow(const vixl::aarch64::Register& result)
1616
{
1617
DebugAssert(result.IsW());
1618
SwitchToFarCode(true, vs);
1619
1620
BackupHostState();
1621
1622
// toss the result
1623
ClearHostReg(result.GetCode());
1624
1625
EndBlockWithException(Exception::Ov);
1626
1627
RestoreHostState();
1628
1629
SwitchToNearCode(false);
1630
}
1631
1632
void CPU::ARM64Recompiler::Compile_dst_op(CompileFlags cf,
1633
void (vixl::aarch64::Assembler::*op)(const vixl::aarch64::Register&,
1634
const vixl::aarch64::Register&,
1635
const vixl::aarch64::Operand&),
1636
bool commutative, bool logical, bool overflow)
1637
{
1638
  AssertRegOrConstS(cf);
  AssertRegOrConstT(cf);

  const Register rd = CFGetRegD(cf);
  if (cf.valid_host_s && cf.valid_host_t)
  {
    (armAsm->*op)(rd, CFGetRegS(cf), CFGetRegT(cf));
  }
  else if (commutative && (cf.const_s || cf.const_t))
  {
    const Register src = cf.const_s ? CFGetRegT(cf) : CFGetRegS(cf);
    if (const u32 cv = GetConstantRegU32(cf.const_s ? cf.MipsS() : cf.MipsT()); cv != 0)
    {
      (armAsm->*op)(rd, src, logical ? armCheckLogicalConstant(cv) : armCheckAddSubConstant(cv));
    }
    else
    {
      if (rd.GetCode() != src.GetCode())
        armAsm->mov(rd, src);
      overflow = false;
    }
  }
  else if (cf.const_s)
  {
    // TODO: Check where we can use wzr here
    EmitMov(RWSCRATCH, GetConstantRegU32(cf.MipsS()));
    (armAsm->*op)(rd, RWSCRATCH, CFGetRegT(cf));
  }
  else if (cf.const_t)
  {
    const Register rs = CFGetRegS(cf);
    if (const u32 cv = GetConstantRegU32(cf.const_s ? cf.MipsS() : cf.MipsT()); cv != 0)
    {
      (armAsm->*op)(rd, rs, logical ? armCheckLogicalConstant(cv) : armCheckAddSubConstant(cv));
    }
    else
    {
      if (rd.GetCode() != rs.GetCode())
        armAsm->mov(rd, rs);
      overflow = false;
    }
  }

  if (overflow)
    TestOverflow(rd);
}

void CPU::ARM64Recompiler::Compile_add(CompileFlags cf)
{
  if (g_settings.cpu_recompiler_memory_exceptions)
    Compile_dst_op(cf, &Assembler::adds, true, false, true);
  else
    Compile_dst_op(cf, &Assembler::add, true, false, false);
}

void CPU::ARM64Recompiler::Compile_addu(CompileFlags cf)
{
  Compile_dst_op(cf, &Assembler::add, true, false, false);
}

void CPU::ARM64Recompiler::Compile_sub(CompileFlags cf)
{
  if (g_settings.cpu_recompiler_memory_exceptions)
    Compile_dst_op(cf, &Assembler::subs, false, false, true);
  else
    Compile_dst_op(cf, &Assembler::sub, false, false, false);
}

void CPU::ARM64Recompiler::Compile_subu(CompileFlags cf)
{
  Compile_dst_op(cf, &Assembler::sub, false, false, false);
}

void CPU::ARM64Recompiler::Compile_and(CompileFlags cf)
{
  AssertRegOrConstS(cf);
  AssertRegOrConstT(cf);

  // special cases - and with self -> self, and with 0 -> 0
  const Register regd = CFGetRegD(cf);
  if (cf.MipsS() == cf.MipsT())
  {
    armAsm->mov(regd, CFGetRegS(cf));
    return;
  }
  else if (HasConstantRegValue(cf.MipsS(), 0) || HasConstantRegValue(cf.MipsT(), 0))
  {
    armAsm->mov(regd, wzr);
    return;
  }

  Compile_dst_op(cf, &Assembler::and_, true, true, false);
}

void CPU::ARM64Recompiler::Compile_or(CompileFlags cf)
{
  AssertRegOrConstS(cf);
  AssertRegOrConstT(cf);

  // or/nor with 0 -> no effect
  const Register regd = CFGetRegD(cf);
  if (HasConstantRegValue(cf.MipsS(), 0) || HasConstantRegValue(cf.MipsT(), 0) || cf.MipsS() == cf.MipsT())
  {
    cf.const_s ? MoveTToReg(regd, cf) : MoveSToReg(regd, cf);
    return;
  }

  Compile_dst_op(cf, &Assembler::orr, true, true, false);
}

void CPU::ARM64Recompiler::Compile_xor(CompileFlags cf)
{
  AssertRegOrConstS(cf);
  AssertRegOrConstT(cf);

  const Register regd = CFGetRegD(cf);
  if (cf.MipsS() == cf.MipsT())
  {
    // xor with self -> zero
    armAsm->mov(regd, wzr);
    return;
  }
  else if (HasConstantRegValue(cf.MipsS(), 0) || HasConstantRegValue(cf.MipsT(), 0))
  {
    // xor with zero -> no effect
    cf.const_s ? MoveTToReg(regd, cf) : MoveSToReg(regd, cf);
    return;
  }

  Compile_dst_op(cf, &Assembler::eor, true, true, false);
}

void CPU::ARM64Recompiler::Compile_nor(CompileFlags cf)
{
  Compile_or(cf);
  armAsm->mvn(CFGetRegD(cf), CFGetRegD(cf));
}

void CPU::ARM64Recompiler::Compile_slt(CompileFlags cf)
{
  Compile_slt(cf, true);
}

void CPU::ARM64Recompiler::Compile_sltu(CompileFlags cf)
{
  Compile_slt(cf, false);
}

void CPU::ARM64Recompiler::Compile_slt(CompileFlags cf, bool sign)
{
  AssertRegOrConstS(cf);
  AssertRegOrConstT(cf);

  // TODO: swap and reverse op for constants
  if (cf.const_s)
  {
    EmitMov(RWSCRATCH, GetConstantRegS32(cf.MipsS()));
    armAsm->cmp(RWSCRATCH, CFGetRegT(cf));
  }
  else if (cf.const_t)
  {
    armAsm->cmp(CFGetRegS(cf), armCheckCompareConstant(GetConstantRegS32(cf.MipsT())));
  }
  else
  {
    armAsm->cmp(CFGetRegS(cf), CFGetRegT(cf));
  }

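  // lt = signed less-than, lo = unsigned lower; cset writes 1 if the condition holds, else 0,
  // which is exactly the MIPS slt/sltu result.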
  armAsm->cset(CFGetRegD(cf), sign ? lt : lo);
}

vixl::aarch64::Register
CPU::ARM64Recompiler::ComputeLoadStoreAddressArg(CompileFlags cf, const std::optional<VirtualMemoryAddress>& address,
                                                 const std::optional<const vixl::aarch64::Register>& reg)
{
  const u32 imm = inst->i.imm_sext32();
  if (cf.valid_host_s && imm == 0 && !reg.has_value())
    return CFGetRegS(cf);

  const Register dst = reg.has_value() ? reg.value() : RWARG1;
  if (address.has_value())
  {
    EmitMov(dst, address.value());
  }
  else if (imm == 0)
  {
    if (cf.valid_host_s)
    {
      if (const Register src = CFGetRegS(cf); src.GetCode() != dst.GetCode())
        armAsm->mov(dst, CFGetRegS(cf));
    }
    else
    {
      armAsm->ldr(dst, MipsPtr(cf.MipsS()));
    }
  }
  else
  {
    if (cf.valid_host_s)
    {
      armAsm->add(dst, CFGetRegS(cf), armCheckAddSubConstant(static_cast<s32>(inst->i.imm_sext32())));
    }
    else
    {
      armAsm->ldr(dst, MipsPtr(cf.MipsS()));
      armAsm->add(dst, dst, armCheckAddSubConstant(static_cast<s32>(inst->i.imm_sext32())));
    }
  }

  return dst;
}

template<typename RegAllocFn>
vixl::aarch64::Register CPU::ARM64Recompiler::GenerateLoad(const vixl::aarch64::Register& addr_reg,
                                                           MemoryAccessSize size, bool sign, bool use_fastmem,
                                                           const RegAllocFn& dst_reg_alloc)
{
  DebugAssert(addr_reg.IsW());
  if (use_fastmem)
  {
    m_cycles += Bus::RAM_READ_TICKS;

    const Register dst = dst_reg_alloc();

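    // LUT fastmem: index the lookup table at RMEMBASE with the guest page number and use the
    // 8-byte entry it loads as the base for the access below; otherwise the access goes directly
    // off RMEMBASE.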
    if (g_settings.cpu_fastmem_mode == CPUFastmemMode::LUT)
    {
      DebugAssert(addr_reg.GetCode() != RWARG3.GetCode());
      armAsm->lsr(RXARG3, addr_reg, Bus::FASTMEM_LUT_PAGE_SHIFT);
      armAsm->ldr(RXARG3, MemOperand(RMEMBASE, RXARG3, LSL, 3));
    }

    const MemOperand mem =
      MemOperand((g_settings.cpu_fastmem_mode == CPUFastmemMode::LUT) ? RXARG3 : RMEMBASE, addr_reg.X());
    u8* start = armAsm->GetCursorAddress<u8*>();
    switch (size)
    {
      case MemoryAccessSize::Byte:
        sign ? armAsm->ldrsb(dst, mem) : armAsm->ldrb(dst, mem);
        break;

      case MemoryAccessSize::HalfWord:
        sign ? armAsm->ldrsh(dst, mem) : armAsm->ldrh(dst, mem);
        break;

      case MemoryAccessSize::Word:
        armAsm->ldr(dst, mem);
        break;
    }

    AddLoadStoreInfo(start, kInstructionSize, addr_reg.GetCode(), dst.GetCode(), size, sign, true);
    return dst;
  }

  if (addr_reg.GetCode() != RWARG1.GetCode())
    armAsm->mov(RWARG1, addr_reg);

  const bool checked = g_settings.cpu_recompiler_memory_exceptions;
  switch (size)
  {
    case MemoryAccessSize::Byte:
    {
      EmitCall(checked ? reinterpret_cast<const void*>(&CPU::RecompilerThunks::ReadMemoryByte) :
                         reinterpret_cast<const void*>(&CPU::RecompilerThunks::UncheckedReadMemoryByte));
    }
    break;
    case MemoryAccessSize::HalfWord:
    {
      EmitCall(checked ? reinterpret_cast<const void*>(&CPU::RecompilerThunks::ReadMemoryHalfWord) :
                         reinterpret_cast<const void*>(&CPU::RecompilerThunks::UncheckedReadMemoryHalfWord));
    }
    break;
    case MemoryAccessSize::Word:
    {
      EmitCall(checked ? reinterpret_cast<const void*>(&CPU::RecompilerThunks::ReadMemoryWord) :
                         reinterpret_cast<const void*>(&CPU::RecompilerThunks::UncheckedReadMemoryWord));
    }
    break;
  }

  // TODO: turn this into an asm function instead
  if (checked)
  {
    SwitchToFarCodeIfBitSet(RXRET, 63);
    BackupHostState();

    // Need to stash this in a temp because of the flush.
    const WRegister temp = WRegister(AllocateTempHostReg(HR_CALLEE_SAVED));
    armAsm->neg(temp.X(), RXRET);
    armAsm->lsl(temp, temp, 2);

    Flush(FLUSH_FOR_C_CALL | FLUSH_FLUSH_MIPS_REGISTERS | FLUSH_FOR_EXCEPTION);

    // cause_bits = (-result << 2) | BD | cop_n
    armAsm->orr(RWARG1, temp,
                armCheckLogicalConstant(Cop0Registers::CAUSE::MakeValueForException(
                  static_cast<Exception>(0), m_current_instruction_branch_delay_slot, false, inst->cop.cop_n)));
    EmitMov(RWARG2, m_current_instruction_pc);
    EmitCall(reinterpret_cast<const void*>(static_cast<void (*)(u32, u32)>(&CPU::RaiseException)));
    FreeHostReg(temp.GetCode());
    EndBlock(std::nullopt, true);

    RestoreHostState();
    SwitchToNearCode(false);
  }

  const Register dst_reg = dst_reg_alloc();
  switch (size)
  {
    case MemoryAccessSize::Byte:
    {
      sign ? armAsm->sxtb(dst_reg, RWRET) : armAsm->uxtb(dst_reg, RWRET);
    }
    break;
    case MemoryAccessSize::HalfWord:
    {
      sign ? armAsm->sxth(dst_reg, RWRET) : armAsm->uxth(dst_reg, RWRET);
    }
    break;
    case MemoryAccessSize::Word:
    {
      if (dst_reg.GetCode() != RWRET.GetCode())
        armAsm->mov(dst_reg, RWRET);
    }
    break;
  }

  return dst_reg;
}

void CPU::ARM64Recompiler::GenerateStore(const vixl::aarch64::Register& addr_reg,
                                         const vixl::aarch64::Register& value_reg, MemoryAccessSize size,
                                         bool use_fastmem)
{
  DebugAssert(addr_reg.IsW() && value_reg.IsW());
  if (use_fastmem)
  {
    if (g_settings.cpu_fastmem_mode == CPUFastmemMode::LUT)
    {
      DebugAssert(addr_reg.GetCode() != RWARG3.GetCode());
      armAsm->lsr(RXARG3, addr_reg, Bus::FASTMEM_LUT_PAGE_SHIFT);
      armAsm->ldr(RXARG3, MemOperand(RMEMBASE, RXARG3, LSL, 3));
    }

    const MemOperand mem =
      MemOperand((g_settings.cpu_fastmem_mode == CPUFastmemMode::LUT) ? RXARG3 : RMEMBASE, addr_reg.X());
    u8* start = armAsm->GetCursorAddress<u8*>();
    switch (size)
    {
      case MemoryAccessSize::Byte:
        armAsm->strb(value_reg, mem);
        break;

      case MemoryAccessSize::HalfWord:
        armAsm->strh(value_reg, mem);
        break;

      case MemoryAccessSize::Word:
        armAsm->str(value_reg, mem);
        break;
    }
    AddLoadStoreInfo(start, kInstructionSize, addr_reg.GetCode(), value_reg.GetCode(), size, false, false);
    return;
  }

  if (addr_reg.GetCode() != RWARG1.GetCode())
    armAsm->mov(RWARG1, addr_reg);
  if (value_reg.GetCode() != RWARG2.GetCode())
    armAsm->mov(RWARG2, value_reg);

  const bool checked = g_settings.cpu_recompiler_memory_exceptions;
  switch (size)
  {
    case MemoryAccessSize::Byte:
    {
      EmitCall(checked ? reinterpret_cast<const void*>(&CPU::RecompilerThunks::WriteMemoryByte) :
                         reinterpret_cast<const void*>(&CPU::RecompilerThunks::UncheckedWriteMemoryByte));
    }
    break;
    case MemoryAccessSize::HalfWord:
    {
      EmitCall(checked ? reinterpret_cast<const void*>(&CPU::RecompilerThunks::WriteMemoryHalfWord) :
                         reinterpret_cast<const void*>(&CPU::RecompilerThunks::UncheckedWriteMemoryHalfWord));
    }
    break;
    case MemoryAccessSize::Word:
    {
      EmitCall(checked ? reinterpret_cast<const void*>(&CPU::RecompilerThunks::WriteMemoryWord) :
                         reinterpret_cast<const void*>(&CPU::RecompilerThunks::UncheckedWriteMemoryWord));
    }
    break;
  }

  // TODO: turn this into an asm function instead
  if (checked)
  {
    SwitchToFarCodeIfRegZeroOrNonZero(RXRET, true);
    BackupHostState();

    // Need to stash this in a temp because of the flush.
    const WRegister temp = WRegister(AllocateTempHostReg(HR_CALLEE_SAVED));
    armAsm->lsl(temp, RWRET, 2);

    Flush(FLUSH_FOR_C_CALL | FLUSH_FLUSH_MIPS_REGISTERS | FLUSH_FOR_EXCEPTION);

    // cause_bits = (result << 2) | BD | cop_n
    armAsm->orr(RWARG1, temp,
                armCheckLogicalConstant(Cop0Registers::CAUSE::MakeValueForException(
                  static_cast<Exception>(0), m_current_instruction_branch_delay_slot, false, inst->cop.cop_n)));
    EmitMov(RWARG2, m_current_instruction_pc);
    EmitCall(reinterpret_cast<const void*>(static_cast<void (*)(u32, u32)>(&CPU::RaiseException)));
    FreeHostReg(temp.GetCode());
    EndBlock(std::nullopt, true);

    RestoreHostState();
    SwitchToNearCode(false);
  }
}

void CPU::ARM64Recompiler::Compile_lxx(CompileFlags cf, MemoryAccessSize size, bool sign, bool use_fastmem,
                                       const std::optional<VirtualMemoryAddress>& address)
{
  const std::optional<WRegister> addr_reg =
    g_settings.gpu_pgxp_enable ? std::optional<WRegister>(WRegister(AllocateTempHostReg(HR_CALLEE_SAVED))) :
                                 std::optional<WRegister>();
  FlushForLoadStore(address, false, use_fastmem);
  const Register addr = ComputeLoadStoreAddressArg(cf, address, addr_reg);
  const Register data = GenerateLoad(addr, size, sign, use_fastmem, [this, cf]() -> Register {
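    // Loads targeting $zero still perform the memory access (so any exception/side effect still
    // happens), but the result is left in the scratch return register and never written back.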
    if (cf.MipsT() == Reg::zero)
      return RWRET;

    return WRegister(AllocateHostReg(GetFlagsForNewLoadDelayedReg(),
                                     EMULATE_LOAD_DELAYS ? HR_TYPE_NEXT_LOAD_DELAY_VALUE : HR_TYPE_CPU_REG,
                                     cf.MipsT()));
  });

  if (g_settings.gpu_pgxp_enable)
  {
    Flush(FLUSH_FOR_C_CALL);

    EmitMov(RWARG1, inst->bits);
    armAsm->mov(RWARG2, addr);
    armAsm->mov(RWARG3, data);
    EmitCall(s_pgxp_mem_load_functions[static_cast<u32>(size)][static_cast<u32>(sign)]);
    FreeHostReg(addr_reg.value().GetCode());
  }
}

void CPU::ARM64Recompiler::Compile_lwx(CompileFlags cf, MemoryAccessSize size, bool sign, bool use_fastmem,
                                       const std::optional<VirtualMemoryAddress>& address)
{
  DebugAssert(size == MemoryAccessSize::Word && !sign);

  const Register addr = WRegister(AllocateTempHostReg(HR_CALLEE_SAVED));
  FlushForLoadStore(address, false, use_fastmem);

  // TODO: if address is constant, this can be simplified..

  // If we're coming from another block, just flush the load delay and hope for the best..
  if (m_load_delay_dirty)
    UpdateLoadDelay();

  // We'd need to be careful here if we weren't overwriting it..
  ComputeLoadStoreAddressArg(cf, address, addr);

  // Do PGXP first, it does its own load.
  if (g_settings.gpu_pgxp_enable && inst->r.rt != Reg::zero)
  {
    Flush(FLUSH_FOR_C_CALL);
    EmitMov(RWARG1, inst->bits);
    armAsm->mov(RWARG2, addr);
    MoveMIPSRegToReg(RWARG3, inst->r.rt, true);
    EmitCall(reinterpret_cast<const void*>(&PGXP::CPU_LWx));
  }

  armAsm->and_(RWARG1, addr, armCheckLogicalConstant(~0x3u));
  GenerateLoad(RWARG1, MemoryAccessSize::Word, false, use_fastmem, []() { return RWRET; });

  if (inst->r.rt == Reg::zero)
  {
    FreeHostReg(addr.GetCode());
    return;
  }

  // lwl/lwr from a load-delayed value takes the new value, but is itself load delayed, so the original value is
  // never written back. NOTE: can't trust T in cf because of the flush
  const Reg rt = inst->r.rt;
  Register value;
  if (m_load_delay_register == rt)
  {
    const u32 existing_ld_rt = (m_load_delay_value_register == NUM_HOST_REGS) ?
                                 AllocateHostReg(HR_MODE_READ, HR_TYPE_LOAD_DELAY_VALUE, rt) :
                                 m_load_delay_value_register;
    RenameHostReg(existing_ld_rt, HR_MODE_WRITE, HR_TYPE_NEXT_LOAD_DELAY_VALUE, rt);
    value = WRegister(existing_ld_rt);
  }
  else
  {
    if constexpr (EMULATE_LOAD_DELAYS)
    {
      value = WRegister(AllocateHostReg(HR_MODE_WRITE, HR_TYPE_NEXT_LOAD_DELAY_VALUE, rt));
      if (const std::optional<u32> rtreg = CheckHostReg(HR_MODE_READ, HR_TYPE_CPU_REG, rt); rtreg.has_value())
        armAsm->mov(value, WRegister(rtreg.value()));
      else if (HasConstantReg(rt))
        EmitMov(value, GetConstantRegU32(rt));
      else
        armAsm->ldr(value, MipsPtr(rt));
    }
    else
    {
      value = WRegister(AllocateHostReg(HR_MODE_READ | HR_MODE_WRITE, HR_TYPE_CPU_REG, rt));
    }
  }

  DebugAssert(value.GetCode() != RWARG2.GetCode() && value.GetCode() != RWARG3.GetCode());
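  // RWARG2 = (addr & 3) * 8, the byte shift within the word; RWARG3 = 24 - shift. Both feed the
  // lwl/lwr merge below.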
  armAsm->and_(RWARG2, addr, 3);
  armAsm->lsl(RWARG2, RWARG2, 3); // *8
  EmitMov(RWARG3, 24);
  armAsm->sub(RWARG3, RWARG3, RWARG2);

  if (inst->op == InstructionOp::lwl)
  {
    // const u32 mask = UINT32_C(0x00FFFFFF) >> shift;
    // new_value = (value & mask) | (RWRET << (24 - shift));
    EmitMov(RWSCRATCH, 0xFFFFFFu);
    armAsm->lsrv(RWSCRATCH, RWSCRATCH, RWARG2);
    armAsm->and_(value, value, RWSCRATCH);
    armAsm->lslv(RWRET, RWRET, RWARG3);
    armAsm->orr(value, value, RWRET);
  }
  else
  {
    // const u32 mask = UINT32_C(0xFFFFFF00) << (24 - shift);
    // new_value = (value & mask) | (RWRET >> shift);
    armAsm->lsrv(RWRET, RWRET, RWARG2);
    EmitMov(RWSCRATCH, 0xFFFFFF00u);
    armAsm->lslv(RWSCRATCH, RWSCRATCH, RWARG3);
    armAsm->and_(value, value, RWSCRATCH);
    armAsm->orr(value, value, RWRET);
  }

  FreeHostReg(addr.GetCode());
}

void CPU::ARM64Recompiler::Compile_lwc2(CompileFlags cf, MemoryAccessSize size, bool sign, bool use_fastmem,
                                        const std::optional<VirtualMemoryAddress>& address)
{
  const u32 index = static_cast<u32>(inst->r.rt.GetValue());
  const auto [ptr, action] = GetGTERegisterPointer(index, true);
  const std::optional<WRegister> addr_reg =
    g_settings.gpu_pgxp_enable ? std::optional<WRegister>(WRegister(AllocateTempHostReg(HR_CALLEE_SAVED))) :
                                 std::optional<WRegister>();
  FlushForLoadStore(address, false, use_fastmem);
  const Register addr = ComputeLoadStoreAddressArg(cf, address, addr_reg);
  const Register value = GenerateLoad(addr, MemoryAccessSize::Word, false, use_fastmem, [this, action = action]() {
    return (action == GTERegisterAccessAction::CallHandler && g_settings.gpu_pgxp_enable) ?
             WRegister(AllocateTempHostReg(HR_CALLEE_SAVED)) :
             RWRET;
  });

  switch (action)
  {
    case GTERegisterAccessAction::Ignore:
    {
      break;
    }

    case GTERegisterAccessAction::Direct:
    {
      armAsm->str(value, PTR(ptr));
      break;
    }

    case GTERegisterAccessAction::SignExtend16:
    {
      armAsm->sxth(RWARG3, value);
      armAsm->str(RWARG3, PTR(ptr));
      break;
    }

    case GTERegisterAccessAction::ZeroExtend16:
    {
      armAsm->uxth(RWARG3, value);
      armAsm->str(RWARG3, PTR(ptr));
      break;
    }

    case GTERegisterAccessAction::CallHandler:
    {
      Flush(FLUSH_FOR_C_CALL);
      armAsm->mov(RWARG2, value);
      EmitMov(RWARG1, index);
      EmitCall(reinterpret_cast<const void*>(&GTE::WriteRegister));
      break;
    }

    case GTERegisterAccessAction::PushFIFO:
    {
      // SXY0 <- SXY1
      // SXY1 <- SXY2
      // SXY2 <- SXYP
      DebugAssert(value.GetCode() != RWARG2.GetCode() && value.GetCode() != RWARG3.GetCode());
      armAsm->ldr(RWARG2, PTR(&g_state.gte_regs.SXY1[0]));
      armAsm->ldr(RWARG3, PTR(&g_state.gte_regs.SXY2[0]));
      armAsm->str(RWARG2, PTR(&g_state.gte_regs.SXY0[0]));
      armAsm->str(RWARG3, PTR(&g_state.gte_regs.SXY1[0]));
      armAsm->str(value, PTR(&g_state.gte_regs.SXY2[0]));
      break;
    }

    default:
    {
      Panic("Unknown action");
      return;
    }
  }

  if (g_settings.gpu_pgxp_enable)
  {
    Flush(FLUSH_FOR_C_CALL);
    armAsm->mov(RWARG3, value);
    if (value.GetCode() != RWRET.GetCode())
      FreeHostReg(value.GetCode());
    armAsm->mov(RWARG2, addr);
    FreeHostReg(addr_reg.value().GetCode());
    EmitMov(RWARG1, inst->bits);
    EmitCall(reinterpret_cast<const void*>(&PGXP::CPU_LWC2));
  }
}

void CPU::ARM64Recompiler::Compile_sxx(CompileFlags cf, MemoryAccessSize size, bool sign, bool use_fastmem,
                                       const std::optional<VirtualMemoryAddress>& address)
{
  AssertRegOrConstS(cf);
  AssertRegOrConstT(cf);

  const std::optional<WRegister> addr_reg =
    g_settings.gpu_pgxp_enable ? std::optional<WRegister>(WRegister(AllocateTempHostReg(HR_CALLEE_SAVED))) :
                                 std::optional<WRegister>();
  FlushForLoadStore(address, true, use_fastmem);
  const Register addr = ComputeLoadStoreAddressArg(cf, address, addr_reg);
  const Register data = cf.valid_host_t ? CFGetRegT(cf) : RWARG2;
  if (!cf.valid_host_t)
    MoveTToReg(RWARG2, cf);

  GenerateStore(addr, data, size, use_fastmem);

  if (g_settings.gpu_pgxp_enable)
  {
    Flush(FLUSH_FOR_C_CALL);
    MoveMIPSRegToReg(RWARG3, cf.MipsT());
    armAsm->mov(RWARG2, addr);
    EmitMov(RWARG1, inst->bits);
    EmitCall(s_pgxp_mem_store_functions[static_cast<u32>(size)]);
    FreeHostReg(addr_reg.value().GetCode());
  }
}

void CPU::ARM64Recompiler::Compile_swx(CompileFlags cf, MemoryAccessSize size, bool sign, bool use_fastmem,
                                       const std::optional<VirtualMemoryAddress>& address)
{
  DebugAssert(size == MemoryAccessSize::Word && !sign);

  // TODO: this can take over rt's value if it's no longer needed
  // NOTE: can't trust T in cf because of the alloc
  const Register addr = WRegister(AllocateTempHostReg(HR_CALLEE_SAVED));

  FlushForLoadStore(address, true, use_fastmem);

  // TODO: if address is constant, this can be simplified..
  // We'd need to be careful here if we weren't overwriting it..
  ComputeLoadStoreAddressArg(cf, address, addr);

  if (g_settings.gpu_pgxp_enable)
  {
    Flush(FLUSH_FOR_C_CALL);
    EmitMov(RWARG1, inst->bits);
    armAsm->mov(RWARG2, addr);
    MoveMIPSRegToReg(RWARG3, inst->r.rt);
    EmitCall(reinterpret_cast<const void*>(&PGXP::CPU_SWx));
  }

  armAsm->and_(RWARG1, addr, armCheckLogicalConstant(~0x3u));
  GenerateLoad(RWARG1, MemoryAccessSize::Word, false, use_fastmem, []() { return RWRET; });

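  // RWSCRATCH = (addr & 3) * 8; addr is then aligned down, so the merged swl/swr word below is
  // stored back to the aligned address.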
  armAsm->and_(RWSCRATCH, addr, 3);
  armAsm->lsl(RWSCRATCH, RWSCRATCH, 3); // *8
  armAsm->and_(addr, addr, armCheckLogicalConstant(~0x3u));

  MoveMIPSRegToReg(RWARG2, inst->r.rt);

  if (inst->op == InstructionOp::swl)
  {
    // const u32 mem_mask = UINT32_C(0xFFFFFF00) << shift;
    // new_value = (RWRET & mem_mask) | (value >> (24 - shift));
    EmitMov(RWARG3, 0xFFFFFF00u);
    armAsm->lslv(RWARG3, RWARG3, RWSCRATCH);
    armAsm->and_(RWRET, RWRET, RWARG3);

    EmitMov(RWARG3, 24);
    armAsm->sub(RWARG3, RWARG3, RWSCRATCH);
    armAsm->lsrv(RWARG2, RWARG2, RWARG3);
    armAsm->orr(RWARG2, RWARG2, RWRET);
  }
  else
  {
    // const u32 mem_mask = UINT32_C(0x00FFFFFF) >> (24 - shift);
    // new_value = (RWRET & mem_mask) | (value << shift);
    armAsm->lslv(RWARG2, RWARG2, RWSCRATCH);

    EmitMov(RWARG3, 24);
    armAsm->sub(RWARG3, RWARG3, RWSCRATCH);
    EmitMov(RWSCRATCH, 0x00FFFFFFu);
    armAsm->lsrv(RWSCRATCH, RWSCRATCH, RWARG3);
    armAsm->and_(RWRET, RWRET, RWSCRATCH);
    armAsm->orr(RWARG2, RWARG2, RWRET);
  }

  GenerateStore(addr, RWARG2, MemoryAccessSize::Word, use_fastmem);
  FreeHostReg(addr.GetCode());
}

void CPU::ARM64Recompiler::Compile_swc2(CompileFlags cf, MemoryAccessSize size, bool sign, bool use_fastmem,
                                        const std::optional<VirtualMemoryAddress>& address)
{
  const u32 index = static_cast<u32>(inst->r.rt.GetValue());
  const auto [ptr, action] = GetGTERegisterPointer(index, false);
  const Register addr = (g_settings.gpu_pgxp_enable || action == GTERegisterAccessAction::CallHandler) ?
                          WRegister(AllocateTempHostReg(HR_CALLEE_SAVED)) :
                          RWARG1;
  const Register data = g_settings.gpu_pgxp_enable ? WRegister(AllocateTempHostReg(HR_CALLEE_SAVED)) : RWARG2;
  FlushForLoadStore(address, true, use_fastmem);
  ComputeLoadStoreAddressArg(cf, address, addr);

  switch (action)
  {
    case GTERegisterAccessAction::Direct:
    {
      armAsm->ldr(data, PTR(ptr));
    }
    break;

    case GTERegisterAccessAction::CallHandler:
    {
      // should already be flushed.. except in fastmem case
      Flush(FLUSH_FOR_C_CALL);
      EmitMov(RWARG1, index);
      EmitCall(reinterpret_cast<const void*>(&GTE::ReadRegister));
      armAsm->mov(data, RWRET);
    }
    break;

    default:
    {
      Panic("Unknown action");
    }
    break;
  }

  GenerateStore(addr, data, size, use_fastmem);
  if (!g_settings.gpu_pgxp_enable)
  {
    if (addr.GetCode() != RWARG1.GetCode())
      FreeHostReg(addr.GetCode());
  }
  else
  {
    // TODO: This can be simplified because we don't need to validate in PGXP..
    Flush(FLUSH_FOR_C_CALL);
    armAsm->mov(RWARG3, data);
    FreeHostReg(data.GetCode());
    armAsm->mov(RWARG2, addr);
    FreeHostReg(addr.GetCode());
    EmitMov(RWARG1, inst->bits);
    EmitCall(reinterpret_cast<const void*>(&PGXP::CPU_SWC2));
  }
}

void CPU::ARM64Recompiler::Compile_mtc0(CompileFlags cf)
{
  // TODO: we need better constant setting here.. which will need backprop
  AssertRegOrConstT(cf);

  const Cop0Reg reg = static_cast<Cop0Reg>(MipsD());
  const u32* ptr = GetCop0RegPtr(reg);
  const u32 mask = GetCop0RegWriteMask(reg);
  if (!ptr)
  {
    Compile_Fallback();
    return;
  }

  if (mask == 0)
  {
    // if it's a read-only register, ignore
    DEBUG_LOG("Ignoring write to read-only cop0 reg {}", static_cast<u32>(reg));
    return;
  }

  // for some registers, we need to test certain bits
  const bool needs_bit_test = (reg == Cop0Reg::SR);
  const Register new_value = RWARG1;
  const Register old_value = RWARG2;
  const Register changed_bits = RWARG3;
  const Register mask_reg = RWSCRATCH;

  // Load old value
  armAsm->ldr(old_value, PTR(ptr));

  // No way we fit this in an immediate..
  EmitMov(mask_reg, mask);

  // update value
  if (cf.valid_host_t)
    armAsm->and_(new_value, CFGetRegT(cf), mask_reg);
  else
    EmitMov(new_value, GetConstantRegU32(cf.MipsT()) & mask);

  if (needs_bit_test)
    armAsm->eor(changed_bits, old_value, new_value);
  armAsm->bic(old_value, old_value, mask_reg);
  armAsm->orr(new_value, old_value, new_value);
  armAsm->str(new_value, PTR(ptr));

  if (reg == Cop0Reg::SR)
  {
    // TODO: replace with register backup
    // We could just inline the whole thing..
    Flush(FLUSH_FOR_C_CALL);

    Label caches_unchanged;
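    // changed_bits = old ^ new; if SR bit 16 (Isc, cache isolation) did not change, skip the
    // memory-pointer/fastmem-base refresh and go straight to the interrupt test.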
    armAsm->tbz(changed_bits, 16, &caches_unchanged);
    EmitCall(reinterpret_cast<const void*>(&CPU::UpdateMemoryPointers));
    armAsm->ldr(RWARG1, PTR(ptr)); // reload value for interrupt test below
    if (CodeCache::IsUsingFastmem())
      armAsm->ldr(RMEMBASE, PTR(&g_state.fastmem_base));
    armAsm->bind(&caches_unchanged);

    TestInterrupts(RWARG1);
  }
  else if (reg == Cop0Reg::CAUSE)
  {
    armAsm->ldr(RWARG1, PTR(&g_state.cop0_regs.sr.bits));
    TestInterrupts(RWARG1);
  }
  else if (reg == Cop0Reg::DCIC || reg == Cop0Reg::BPCM)
  {
    // need to check whether we're switching to debug mode
    Flush(FLUSH_FOR_C_CALL);
    EmitCall(reinterpret_cast<const void*>(&CPU::UpdateDebugDispatcherFlag));
    SwitchToFarCodeIfRegZeroOrNonZero(RWRET, true);
    BackupHostState();
    Flush(FLUSH_FOR_EARLY_BLOCK_EXIT);
    EmitCall(reinterpret_cast<const void*>(&CPU::ExitExecution)); // does not return
    RestoreHostState();
    SwitchToNearCode(false);
  }
}

void CPU::ARM64Recompiler::Compile_rfe(CompileFlags cf)
{
  // shift mode bits right two, preserving upper bits
  armAsm->ldr(RWARG1, PTR(&g_state.cop0_regs.sr.bits));
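  // BFXIL extracts the 4-bit field SR[5:2] and inserts it into SR[3:0], leaving all other bits untouched.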
  armAsm->bfxil(RWARG1, RWARG1, 2, 4);
  armAsm->str(RWARG1, PTR(&g_state.cop0_regs.sr.bits));

  TestInterrupts(RWARG1);
}

void CPU::ARM64Recompiler::TestInterrupts(const vixl::aarch64::Register& sr)
{
  DebugAssert(sr.IsW());

  // if Iec == 0 then goto no_interrupt
  Label no_interrupt;
  armAsm->tbz(sr, 0, &no_interrupt);

  // sr & cause
  armAsm->ldr(RWSCRATCH, PTR(&g_state.cop0_regs.cause.bits));
  armAsm->and_(sr, sr, RWSCRATCH);

  // ((sr & cause) & 0xff00) == 0 goto no_interrupt
  armAsm->tst(sr, 0xFF00);

  SwitchToFarCode(true, ne);
  BackupHostState();

  // Update load delay, this normally happens at the end of an instruction, but we're finishing it early.
  UpdateLoadDelay();

  Flush(FLUSH_END_BLOCK | FLUSH_FOR_EXCEPTION | FLUSH_FOR_C_CALL);

  // Can't use EndBlockWithException() here, because it'll use the wrong PC.
  // Can't use RaiseException() on the fast path if we're the last instruction, because the next PC is unknown.
  if (!iinfo->is_last_instruction)
  {
    EmitMov(RWARG1, Cop0Registers::CAUSE::MakeValueForException(Exception::INT, iinfo->is_branch_instruction, false,
                                                                (inst + 1)->cop.cop_n));
    EmitMov(RWARG2, m_compiler_pc);
    EmitCall(reinterpret_cast<const void*>(static_cast<void (*)(u32, u32)>(&CPU::RaiseException)));
    m_dirty_pc = false;
    EndAndLinkBlock(std::nullopt, true, false);
  }
  else
  {
    if (m_dirty_pc)
      EmitMov(RWARG1, m_compiler_pc);
    armAsm->str(wzr, PTR(&g_state.downcount));
    if (m_dirty_pc)
      armAsm->str(RWARG1, PTR(&g_state.pc));
    m_dirty_pc = false;
    EndAndLinkBlock(std::nullopt, false, true);
  }

  RestoreHostState();
  SwitchToNearCode(false);

  armAsm->bind(&no_interrupt);
}

void CPU::ARM64Recompiler::Compile_mfc2(CompileFlags cf)
{
  const u32 index = inst->cop.Cop2Index();
  const Reg rt = inst->r.rt;

  const auto [ptr, action] = GetGTERegisterPointer(index, false);
  if (action == GTERegisterAccessAction::Ignore)
    return;

  u32 hreg;
  if (action == GTERegisterAccessAction::Direct)
  {
    hreg = AllocateHostReg(GetFlagsForNewLoadDelayedReg(),
                           EMULATE_LOAD_DELAYS ? HR_TYPE_NEXT_LOAD_DELAY_VALUE : HR_TYPE_CPU_REG, rt);
    armAsm->ldr(WRegister(hreg), PTR(ptr));
  }
  else if (action == GTERegisterAccessAction::CallHandler)
  {
    Flush(FLUSH_FOR_C_CALL);
    EmitMov(RWARG1, index);
    EmitCall(reinterpret_cast<const void*>(&GTE::ReadRegister));

    hreg = AllocateHostReg(GetFlagsForNewLoadDelayedReg(),
                           EMULATE_LOAD_DELAYS ? HR_TYPE_NEXT_LOAD_DELAY_VALUE : HR_TYPE_CPU_REG, rt);
    armAsm->mov(WRegister(hreg), RWRET);
  }
  else
  {
    Panic("Unknown action");
    return;
  }

  if (g_settings.gpu_pgxp_enable)
  {
    Flush(FLUSH_FOR_C_CALL);
    EmitMov(RWARG1, inst->bits);
    armAsm->mov(RWARG2, WRegister(hreg));
    EmitCall(reinterpret_cast<const void*>(&PGXP::CPU_MFC2));
  }
}

void CPU::ARM64Recompiler::Compile_mtc2(CompileFlags cf)
{
  const u32 index = inst->cop.Cop2Index();
  const auto [ptr, action] = GetGTERegisterPointer(index, true);
  if (action == GTERegisterAccessAction::Ignore)
    return;

  if (action == GTERegisterAccessAction::Direct)
  {
    if (cf.const_t)
      StoreConstantToCPUPointer(GetConstantRegU32(cf.MipsT()), ptr);
    else
      armAsm->str(CFGetRegT(cf), PTR(ptr));
  }
  else if (action == GTERegisterAccessAction::SignExtend16 || action == GTERegisterAccessAction::ZeroExtend16)
  {
    const bool sign = (action == GTERegisterAccessAction::SignExtend16);
    if (cf.valid_host_t)
    {
      sign ? armAsm->sxth(RWARG1, CFGetRegT(cf)) : armAsm->uxth(RWARG1, CFGetRegT(cf));
      armAsm->str(RWARG1, PTR(ptr));
    }
    else if (cf.const_t)
    {
      const u16 cv = Truncate16(GetConstantRegU32(cf.MipsT()));
      StoreConstantToCPUPointer(sign ? ::SignExtend32(cv) : ::ZeroExtend32(cv), ptr);
    }
    else
    {
      Panic("Unsupported setup");
    }
  }
  else if (action == GTERegisterAccessAction::CallHandler)
  {
    Flush(FLUSH_FOR_C_CALL);
    EmitMov(RWARG1, index);
    MoveTToReg(RWARG2, cf);
    EmitCall(reinterpret_cast<const void*>(&GTE::WriteRegister));
  }
  else if (action == GTERegisterAccessAction::PushFIFO)
  {
    // SXY0 <- SXY1
    // SXY1 <- SXY2
    // SXY2 <- SXYP
    DebugAssert(RWRET.GetCode() != RWARG2.GetCode() && RWRET.GetCode() != RWARG3.GetCode());
    armAsm->ldr(RWARG2, PTR(&g_state.gte_regs.SXY1[0]));
    armAsm->ldr(RWARG3, PTR(&g_state.gte_regs.SXY2[0]));
    armAsm->str(RWARG2, PTR(&g_state.gte_regs.SXY0[0]));
    armAsm->str(RWARG3, PTR(&g_state.gte_regs.SXY1[0]));
    if (cf.valid_host_t)
      armAsm->str(CFGetRegT(cf), PTR(&g_state.gte_regs.SXY2[0]));
    else if (cf.const_t)
      StoreConstantToCPUPointer(GetConstantRegU32(cf.MipsT()), &g_state.gte_regs.SXY2[0]);
    else
      Panic("Unsupported setup");
  }
  else
  {
    Panic("Unknown action");
  }
}

void CPU::ARM64Recompiler::Compile_cop2(CompileFlags cf)
{
  TickCount func_ticks;
  GTE::InstructionImpl func = GTE::GetInstructionImpl(inst->bits, &func_ticks);

  Flush(FLUSH_FOR_C_CALL);
  EmitMov(RWARG1, inst->bits & GTE::Instruction::REQUIRED_BITS_MASK);
  EmitCall(reinterpret_cast<const void*>(func));

  AddGTETicks(func_ticks);
}

u32 CPU::Recompiler::CompileLoadStoreThunk(void* thunk_code, u32 thunk_space, void* code_address, u32 code_size,
                                           TickCount cycles_to_add, TickCount cycles_to_remove, u32 gpr_bitmask,
                                           u8 address_register, u8 data_register, MemoryAccessSize size,
                                           bool is_signed, bool is_load)
{
  Assembler arm_asm(static_cast<u8*>(thunk_code), thunk_space);
  Assembler* armAsm = &arm_asm;

#ifdef VIXL_DEBUG
  vixl::CodeBufferCheckScope asm_check(armAsm, thunk_space, vixl::CodeBufferCheckScope::kDontReserveBufferSpace);
#endif

  static constexpr u32 GPR_SIZE = 8;

  // save regs
  u32 num_gprs = 0;

  for (u32 i = 0; i < NUM_HOST_REGS; i++)
  {
    if ((gpr_bitmask & (1u << i)) && armIsCallerSavedRegister(i) && (!is_load || data_register != i))
      num_gprs++;
  }

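  // Round the spilled register count up to an even number of 8-byte slots, keeping SP 16-byte aligned.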
  const u32 stack_size = (((num_gprs + 1) & ~1u) * GPR_SIZE);

  // TODO: use stp+ldp, vixl helper?

  if (stack_size > 0)
  {
    armAsm->sub(sp, sp, stack_size);

    u32 stack_offset = 0;
    for (u32 i = 0; i < NUM_HOST_REGS; i++)
    {
      if ((gpr_bitmask & (1u << i)) && armIsCallerSavedRegister(i) && (!is_load || data_register != i))
      {
        armAsm->str(XRegister(i), MemOperand(sp, stack_offset));
        stack_offset += GPR_SIZE;
      }
    }
  }

  if (cycles_to_add != 0)
  {
    // NOTE: we have to reload here, because memory writes can run DMA, which can screw with cycles
    Assert(Assembler::IsImmAddSub(cycles_to_add));
    armAsm->ldr(RWSCRATCH, PTR(&g_state.pending_ticks));
    armAsm->add(RWSCRATCH, RWSCRATCH, cycles_to_add);
    armAsm->str(RWSCRATCH, PTR(&g_state.pending_ticks));
  }

  if (address_register != static_cast<u8>(RWARG1.GetCode()))
    armAsm->mov(RWARG1, WRegister(address_register));

  if (!is_load)
  {
    if (data_register != static_cast<u8>(RWARG2.GetCode()))
      armAsm->mov(RWARG2, WRegister(data_register));
  }

  switch (size)
  {
    case MemoryAccessSize::Byte:
    {
      armEmitCall(armAsm,
                  is_load ? reinterpret_cast<const void*>(&CPU::RecompilerThunks::UncheckedReadMemoryByte) :
                            reinterpret_cast<const void*>(&CPU::RecompilerThunks::UncheckedWriteMemoryByte),
                  false);
    }
    break;
    case MemoryAccessSize::HalfWord:
    {
      armEmitCall(armAsm,
                  is_load ? reinterpret_cast<const void*>(&CPU::RecompilerThunks::UncheckedReadMemoryHalfWord) :
                            reinterpret_cast<const void*>(&CPU::RecompilerThunks::UncheckedWriteMemoryHalfWord),
                  false);
    }
    break;
    case MemoryAccessSize::Word:
    {
      armEmitCall(armAsm,
                  is_load ? reinterpret_cast<const void*>(&CPU::RecompilerThunks::UncheckedReadMemoryWord) :
                            reinterpret_cast<const void*>(&CPU::RecompilerThunks::UncheckedWriteMemoryWord),
                  false);
    }
    break;
  }

  if (is_load)
  {
    const WRegister dst = WRegister(data_register);
    switch (size)
    {
      case MemoryAccessSize::Byte:
      {
        is_signed ? armAsm->sxtb(dst, RWRET) : armAsm->uxtb(dst, RWRET);
      }
      break;
      case MemoryAccessSize::HalfWord:
      {
        is_signed ? armAsm->sxth(dst, RWRET) : armAsm->uxth(dst, RWRET);
      }
      break;
      case MemoryAccessSize::Word:
      {
        if (dst.GetCode() != RWRET.GetCode())
          armAsm->mov(dst, RWRET);
      }
      break;
    }
  }

  if (cycles_to_remove != 0)
  {
    Assert(Assembler::IsImmAddSub(cycles_to_remove));
    armAsm->ldr(RWSCRATCH, PTR(&g_state.pending_ticks));
    armAsm->sub(RWSCRATCH, RWSCRATCH, cycles_to_remove);
    armAsm->str(RWSCRATCH, PTR(&g_state.pending_ticks));
  }

  // restore regs
  if (stack_size > 0)
  {
    u32 stack_offset = 0;
    for (u32 i = 0; i < NUM_HOST_REGS; i++)
    {
      if ((gpr_bitmask & (1u << i)) && armIsCallerSavedRegister(i) && (!is_load || data_register != i))
      {
        armAsm->ldr(XRegister(i), MemOperand(sp, stack_offset));
        stack_offset += GPR_SIZE;
      }
    }

    armAsm->add(sp, sp, stack_size);
  }

  armEmitJmp(armAsm, static_cast<const u8*>(code_address) + code_size, true);
  armAsm->FinalizeCode();

  return static_cast<u32>(armAsm->GetCursorOffset());
}

#endif // CPU_ARCH_ARM64