GitHub Repository: stenzek/duckstation
Path: blob/master/src/core/cpu_recompiler_arm32.cpp
1
// SPDX-FileCopyrightText: 2019-2024 Connor McLaughlin <[email protected]>
2
// SPDX-License-Identifier: CC-BY-NC-ND-4.0
3
4
#include "cpu_recompiler_arm32.h"
5
#include "cpu_core_private.h"
6
#include "cpu_pgxp.h"
7
#include "gte.h"
8
#include "settings.h"
9
#include "timing_event.h"
10
11
#include "common/align.h"
12
#include "common/assert.h"
13
#include "common/log.h"
14
#include "common/memmap.h"
15
#include "common/string_util.h"
16
17
#include <limits>
18
19
#ifdef CPU_ARCH_ARM32
20
21
#include "vixl/aarch32/constants-aarch32.h"
22
#include "vixl/aarch32/instructions-aarch32.h"
23
24
#ifdef ENABLE_HOST_DISASSEMBLY
25
#include "vixl/aarch32/disasm-aarch32.h"
26
#include <iostream>
27
#endif
28
29
LOG_CHANNEL(Recompiler);
30
31
#define PTR(x) vixl::aarch32::MemOperand(RSTATE, (((u8*)(x)) - ((u8*)&g_state)))
32
#define RMEMBASE vixl::aarch32::r3
33
34
#define RRET vixl::aarch32::r0
35
#define RRETHI vixl::aarch32::r1
36
#define RARG1 vixl::aarch32::r0
37
#define RARG2 vixl::aarch32::r1
38
#define RARG3 vixl::aarch32::r2
39
#define RSCRATCH vixl::aarch32::r12
40
#define RSTATE vixl::aarch32::r4
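// Register convention for generated code: r0-r2 (RARG1-RARG3) carry call arguments and
// scratch values, r0/r1 (RRET/RRETHI) hold return values, r12 (RSCRATCH) is a short-lived
// temporary, r4 (RSTATE) permanently holds &g_state, and r3 (RMEMBASE) holds the fastmem
// LUT base for blocks that perform loads/stores. PTR(x) builds a MemOperand addressing x
// relative to RSTATE, so state fields are reached without materializing absolute addresses.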
41
42
static bool armIsCallerSavedRegister(u32 id);
43
static s32 armGetPCDisplacement(const void* current, const void* target);
44
static bool armIsPCDisplacementInImmediateRange(s32 displacement);
45
static void armMoveAddressToReg(vixl::aarch32::Assembler* armAsm, const vixl::aarch32::Register& reg, const void* addr);
46
static void armEmitMov(vixl::aarch32::Assembler* armAsm, const vixl::aarch32::Register& rd, u32 imm);
47
static void armEmitJmp(vixl::aarch32::Assembler* armAsm, const void* ptr, bool force_inline);
48
static void armEmitCall(vixl::aarch32::Assembler* armAsm, const void* ptr, bool force_inline);
49
static void armEmitCondBranch(vixl::aarch32::Assembler* armAsm, vixl::aarch32::Condition cond, const void* ptr);
50
static void armEmitFarLoad(vixl::aarch32::Assembler* armAsm, const vixl::aarch32::Register& reg, const void* addr);
51
static void armEmitFarStore(vixl::aarch32::Assembler* armAsm, const vixl::aarch32::Register& reg, const void* addr,
52
const vixl::aarch32::Register& tempreg = RSCRATCH);
53
static u8* armGetJumpTrampoline(const void* target);
54
55
static constexpr u32 TRAMPOLINE_AREA_SIZE = 4 * 1024;
56
static std::unordered_map<const void*, u32> s_trampoline_targets;
57
static u8* s_trampoline_start_ptr = nullptr;
58
static u32 s_trampoline_used = 0;
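// Branch targets outside the range of a single A32 B/BL are routed through a small
// trampoline area (TRAMPOLINE_AREA_SIZE bytes, reserved directly after the ASM functions
// by EmitASMFunctions). Trampolines are deduplicated per target via s_trampoline_targets,
// and s_trampoline_used tracks how many bytes of the area have been consumed.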
59
60
namespace CPU {
61
62
using namespace vixl::aarch32;
63
64
static ARM32Recompiler s_instance;
65
Recompiler* g_compiler = &s_instance;
66
67
} // namespace CPU
68
69
bool armIsCallerSavedRegister(u32 id)
70
{
71
return ((id >= 0 && id <= 3) || // r0-r3
72
(id == 12 || id == 14)); // r12 (ip) and r14 (lr)
73
}
74
75
s32 armGetPCDisplacement(const void* current, const void* target)
76
{
77
Assert(Common::IsAlignedPow2(reinterpret_cast<size_t>(current), 4));
78
Assert(Common::IsAlignedPow2(reinterpret_cast<size_t>(target), 4));
79
return static_cast<s32>((reinterpret_cast<ptrdiff_t>(target) - reinterpret_cast<ptrdiff_t>(current)));
80
}
81
82
bool armIsPCDisplacementInImmediateRange(s32 displacement)
83
{
84
return (displacement >= -33554432 && displacement <= 33554428);
85
}
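// A32 B/BL encode a signed 24-bit immediate shifted left by two, so the encodable
// displacement is [-2^25, 2^25 - 4] bytes, i.e. [-33554432, +33554428]. Anything outside
// this window needs either a trampoline or an indirect bx/blx through a register.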
86
87
void armEmitMov(vixl::aarch32::Assembler* armAsm, const vixl::aarch32::Register& rd, u32 imm)
88
{
89
if (vixl::IsUintN(16, imm))
90
{
91
armAsm->mov(vixl::aarch32::al, rd, imm & 0xffff);
92
return;
93
}
94
95
armAsm->mov(vixl::aarch32::al, rd, imm & 0xffff);
96
armAsm->movt(vixl::aarch32::al, rd, imm >> 16);
97
}
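// Constants wider than 16 bits are materialized with a mov (movw) of the low half followed
// by a movt of the high half, e.g. 0x80010000 -> mov rd, #0x0000; movt rd, #0x8001.
// Values that fit in 16 bits need only the single mov above.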
98
99
void armMoveAddressToReg(vixl::aarch32::Assembler* armAsm, const vixl::aarch32::Register& reg, const void* addr)
100
{
101
armEmitMov(armAsm, reg, static_cast<u32>(reinterpret_cast<uintptr_t>(addr)));
102
}
103
104
void armEmitJmp(vixl::aarch32::Assembler* armAsm, const void* ptr, bool force_inline)
105
{
106
const void* cur = armAsm->GetCursorAddress<const void*>();
107
s32 displacement = armGetPCDisplacement(cur, ptr);
108
bool use_bx = !armIsPCDisplacementInImmediateRange(displacement);
109
if (use_bx && !force_inline)
110
{
111
if (u8* trampoline = armGetJumpTrampoline(ptr); trampoline)
112
{
113
displacement = armGetPCDisplacement(cur, trampoline);
114
use_bx = !armIsPCDisplacementInImmediateRange(displacement);
115
}
116
}
117
118
if (use_bx)
119
{
120
armMoveAddressToReg(armAsm, RSCRATCH, ptr);
121
armAsm->bx(RSCRATCH);
122
}
123
else
124
{
125
vixl::aarch32::Label label(displacement + armAsm->GetCursorOffset());
126
armAsm->b(&label);
127
}
128
}
129
130
void armEmitCall(vixl::aarch32::Assembler* armAsm, const void* ptr, bool force_inline)
131
{
132
const void* cur = armAsm->GetCursorAddress<const void*>();
133
s32 displacement = armGetPCDisplacement(cur, ptr);
134
bool use_blx = !armIsPCDisplacementInImmediateRange(displacement);
135
if (use_blx && !force_inline)
136
{
137
if (u8* trampoline = armGetJumpTrampoline(ptr); trampoline)
138
{
139
displacement = armGetPCDisplacement(cur, trampoline);
140
use_blx = !armIsPCDisplacementInImmediateRange(displacement);
141
}
142
}
143
144
if (use_blx)
145
{
146
armMoveAddressToReg(armAsm, RSCRATCH, ptr);
147
armAsm->blx(RSCRATCH);
148
}
149
else
150
{
151
vixl::aarch32::Label label(displacement + armAsm->GetCursorOffset());
152
armAsm->bl(&label);
153
}
154
}
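// armEmitJmp and armEmitCall share the same fallback chain: emit a direct b/bl when the
// target is within the +/-32MB branch range, otherwise try a trampoline (which may bring
// the target back into range), and as a last resort load the absolute address into r12 and
// bx/blx through it. force_inline=true skips the trampoline lookup and emits the
// mov+bx/blx sequence inline when the target is out of range.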
155
156
void armEmitCondBranch(vixl::aarch32::Assembler* armAsm, vixl::aarch32::Condition cond, const void* ptr)
157
{
158
const s32 displacement = armGetPCDisplacement(armAsm->GetCursorAddress<const void*>(), ptr);
159
if (!armIsPCDisplacementInImmediateRange(displacement))
160
{
161
armMoveAddressToReg(armAsm, RSCRATCH, ptr);
162
armAsm->blx(cond, RSCRATCH);
163
}
164
else
165
{
166
vixl::aarch32::Label label(displacement + armAsm->GetCursorOffset());
167
armAsm->b(cond, &label);
168
}
169
}
170
171
void armEmitFarLoad(vixl::aarch32::Assembler* armAsm, const vixl::aarch32::Register& reg, const void* addr)
172
{
173
armMoveAddressToReg(armAsm, reg, addr);
174
armAsm->ldr(reg, vixl::aarch32::MemOperand(reg));
175
}
176
177
[[maybe_unused]] void armEmitFarStore(vixl::aarch32::Assembler* armAsm, const vixl::aarch32::Register& reg,
178
const void* addr, const vixl::aarch32::Register& tempreg)
179
{
180
armMoveAddressToReg(armAsm, tempreg, addr);
181
armAsm->str(reg, vixl::aarch32::MemOperand(tempreg));
182
}
183
184
void CPU::CodeCache::DisassembleAndLogHostCode(const void* start, u32 size)
185
{
186
#ifdef ENABLE_HOST_DISASSEMBLY
187
vixl::aarch32::PrintDisassembler dis(std::cout, 0);
188
dis.SetCodeAddress(reinterpret_cast<uintptr_t>(start));
189
dis.DisassembleA32Buffer(static_cast<const u32*>(start), size);
190
#else
191
ERROR_LOG("Not compiled with ENABLE_HOST_DISASSEMBLY.");
192
#endif
193
}
194
195
u32 CPU::CodeCache::GetHostInstructionCount(const void* start, u32 size)
196
{
197
return size / vixl::aarch32::kA32InstructionSizeInBytes;
198
}
199
200
u32 CPU::CodeCache::EmitJump(void* code, const void* dst, bool flush_icache)
201
{
202
using namespace vixl::aarch32;
203
204
const s32 disp = armGetPCDisplacement(code, dst);
205
DebugAssert(armIsPCDisplacementInImmediateRange(disp));
206
207
// Emit a single relative branch to the target, expressed as a Label bound at the computed displacement.
208
{
209
Assembler emit(static_cast<vixl::byte*>(code), kA32InstructionSizeInBytes, A32);
210
Label label(disp);
211
emit.b(&label);
212
}
213
214
if (flush_icache)
215
MemMap::FlushInstructionCache(code, kA32InstructionSizeInBytes);
216
217
return kA32InstructionSizeInBytes;
218
}
219
220
u8* armGetJumpTrampoline(const void* target)
221
{
222
auto it = s_trampoline_targets.find(target);
223
if (it != s_trampoline_targets.end())
224
return s_trampoline_start_ptr + it->second;
225
226
// align to 16 bytes?
227
const u32 offset = s_trampoline_used; // Common::AlignUpPow2(s_trampoline_used, 16);
228
229
// Worst case for A32 is movw+movt+bx (12 bytes); require 20 bytes to leave headroom.
230
if (TRAMPOLINE_AREA_SIZE - offset < 20)
231
{
232
Panic("Ran out of space in constant pool");
233
return nullptr;
234
}
235
236
u8* start = s_trampoline_start_ptr + offset;
237
vixl::aarch32::Assembler armAsm(start, TRAMPOLINE_AREA_SIZE - offset);
238
armMoveAddressToReg(&armAsm, RSCRATCH, target);
239
armAsm.bx(RSCRATCH);
240
241
const u32 size = static_cast<u32>(armAsm.GetSizeOfCodeGenerated());
242
DebugAssert(size < 20);
243
s_trampoline_targets.emplace(target, offset);
244
s_trampoline_used = offset + static_cast<u32>(size);
245
246
MemMap::FlushInstructionCache(start, size);
247
return start;
248
}
249
250
u32 CPU::CodeCache::EmitASMFunctions(void* code, u32 code_size)
251
{
252
using namespace vixl::aarch32;
253
254
Assembler actual_asm(static_cast<u8*>(code), code_size);
255
Assembler* armAsm = &actual_asm;
256
257
#ifdef VIXL_DEBUG
258
vixl::CodeBufferCheckScope asm_check(armAsm, code_size, vixl::CodeBufferCheckScope::kDontReserveBufferSpace);
259
#endif
260
261
Label dispatch;
262
Label run_events_and_dispatch;
263
264
g_enter_recompiler = armAsm->GetCursorAddress<decltype(g_enter_recompiler)>();
265
{
266
// Need the CPU state for basically everything :-)
267
armMoveAddressToReg(armAsm, RSTATE, &g_state);
268
}
269
270
// Check for pending events (pending_ticks >= downcount) before dispatching.
271
{
272
Label skip_event_check;
273
armAsm->ldr(RARG1, PTR(&g_state.pending_ticks));
274
armAsm->ldr(RARG2, PTR(&g_state.downcount));
275
armAsm->cmp(RARG1, RARG2);
276
armAsm->b(lt, &skip_event_check);
277
278
g_run_events_and_dispatch = armAsm->GetCursorAddress<const void*>();
279
armAsm->bind(&run_events_and_dispatch);
280
armEmitCall(armAsm, reinterpret_cast<const void*>(&TimingEvents::RunEvents), true);
281
282
armAsm->bind(&skip_event_check);
283
}
284
285
// TODO: align?
286
g_dispatcher = armAsm->GetCursorAddress<const void*>();
287
{
288
armAsm->bind(&dispatch);
289
290
// RARG2 <- g_code_lut[pc >> 16] (first-level table)
291
armAsm->ldr(RARG1, PTR(&g_state.pc));
292
armMoveAddressToReg(armAsm, RARG3, g_code_lut.data());
293
armAsm->lsr(RARG2, RARG1, 16);
294
armAsm->ubfx(RARG1, RARG1, 2, 14);
295
armAsm->ldr(RARG2, MemOperand(RARG3, RARG2, LSL, 2));
296
297
// bx(RARG2[(pc >> 2) & 0x3fff]) - jump to the block entry for this pc
298
armAsm->ldr(RARG1, MemOperand(RARG2, RARG1, LSL, 2));
299
armAsm->bx(RARG1);
300
}
301
302
g_compile_or_revalidate_block = armAsm->GetCursorAddress<const void*>();
303
{
304
armAsm->ldr(RARG1, PTR(&g_state.pc));
305
armEmitCall(armAsm, reinterpret_cast<const void*>(&CompileOrRevalidateBlock), true);
306
armAsm->b(&dispatch);
307
}
308
309
g_discard_and_recompile_block = armAsm->GetCursorAddress<const void*>();
310
{
311
armAsm->ldr(RARG1, PTR(&g_state.pc));
312
armEmitCall(armAsm, reinterpret_cast<const void*>(&DiscardAndRecompileBlock), true);
313
armAsm->b(&dispatch);
314
}
315
316
g_interpret_block = armAsm->GetCursorAddress<const void*>();
317
{
318
armEmitCall(armAsm, reinterpret_cast<const void*>(GetInterpretUncachedBlockFunction()), true);
319
armAsm->ldr(RARG1, PTR(&g_state.pending_ticks));
320
armAsm->ldr(RARG2, PTR(&g_state.downcount));
321
armAsm->cmp(RARG1, RARG2);
322
armAsm->b(ge, &run_events_and_dispatch);
323
armAsm->b(&dispatch);
324
}
325
326
armAsm->FinalizeCode();
327
328
s_trampoline_targets.clear();
329
s_trampoline_start_ptr = static_cast<u8*>(code) + armAsm->GetCursorOffset();
330
s_trampoline_used = 0;
331
332
return static_cast<u32>(armAsm->GetCursorOffset()) + TRAMPOLINE_AREA_SIZE;
333
}
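// The ASM function block generated above is laid out as: the recompiler entry point (loads
// &g_state into RSTATE), an event check that falls through into the dispatcher, the
// dispatcher itself (two-level lookup in g_code_lut indexed by pc, then an indirect bx to
// the block entry), the compile/revalidate and discard/recompile stubs, and the
// uncached-interpreter fallback loop. The returned size includes TRAMPOLINE_AREA_SIZE so
// the trampoline area set up above is reserved alongside these functions.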
334
335
void CPU::CodeCache::EmitAlignmentPadding(void* dst, size_t size)
336
{
337
constexpr u8 padding_value = 0x00;
338
std::memset(dst, padding_value, size);
339
}
340
341
CPU::ARM32Recompiler::ARM32Recompiler() : m_emitter(A32), m_far_emitter(A32)
342
{
343
}
344
345
CPU::ARM32Recompiler::~ARM32Recompiler() = default;
346
347
const void* CPU::ARM32Recompiler::GetCurrentCodePointer()
348
{
349
return armAsm->GetCursorAddress<const void*>();
350
}
351
352
void CPU::ARM32Recompiler::Reset(CodeCache::Block* block, u8* code_buffer, u32 code_buffer_space, u8* far_code_buffer,
353
u32 far_code_space)
354
{
355
Recompiler::Reset(block, code_buffer, code_buffer_space, far_code_buffer, far_code_space);
356
357
// TODO: don't recreate this every time..
358
DebugAssert(!armAsm);
359
m_emitter.GetBuffer()->Reset(code_buffer, code_buffer_space);
360
m_far_emitter.GetBuffer()->Reset(far_code_buffer, far_code_space);
361
armAsm = &m_emitter;
362
363
#ifdef VIXL_DEBUG
364
m_emitter_check = std::make_unique<vixl::CodeBufferCheckScope>(m_emitter.get(), code_buffer_space,
365
vixl::CodeBufferCheckScope::kDontReserveBufferSpace);
366
m_far_emitter_check = std::make_unique<vixl::CodeBufferCheckScope>(
367
m_far_emitter.get(), far_code_space, vixl::CodeBufferCheckScope::kDontReserveBufferSpace);
368
#endif
369
370
// Need to wipe it out so it's correct when toggling fastmem.
371
m_host_regs = {};
372
373
const u32 membase_idx =
374
(CodeCache::IsUsingFastmem() && block->HasFlag(CodeCache::BlockFlags::ContainsLoadStoreInstructions)) ?
375
RMEMBASE.GetCode() :
376
NUM_HOST_REGS;
377
for (u32 i = 0; i < NUM_HOST_REGS; i++)
378
{
379
HostRegAlloc& ra = m_host_regs[i];
380
381
if (i == RARG1.GetCode() || i == RARG2.GetCode() || i == RARG3.GetCode() || i == RSCRATCH.GetCode() ||
382
i == RSTATE.GetCode() || i == membase_idx || i == sp.GetCode() || i == pc.GetCode())
383
{
384
continue;
385
}
386
387
ra.flags = HR_USABLE | (armIsCallerSavedRegister(i) ? 0 : HR_CALLEE_SAVED);
388
}
389
}
390
391
void CPU::ARM32Recompiler::SwitchToFarCode(bool emit_jump, vixl::aarch32::ConditionType cond)
392
{
393
DebugAssert(armAsm == &m_emitter);
394
if (emit_jump)
395
{
396
const s32 disp = armGetPCDisplacement(GetCurrentCodePointer(), m_far_emitter.GetCursorAddress<const void*>());
397
if (armIsPCDisplacementInImmediateRange(disp))
398
{
399
Label ldisp(armAsm->GetCursorOffset() + disp);
400
armAsm->b(cond, &ldisp);
401
}
402
else if (cond != vixl::aarch32::al)
403
{
404
Label skip;
405
armAsm->b(Condition(cond).Negate(), &skip);
406
armEmitJmp(armAsm, m_far_emitter.GetCursorAddress<const void*>(), true);
407
armAsm->bind(&skip);
408
}
409
else
410
{
411
armEmitJmp(armAsm, m_far_emitter.GetCursorAddress<const void*>(), true);
412
}
413
}
414
armAsm = &m_far_emitter;
415
}
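// Blocks are emitted into two buffers: the near buffer holds the hot path, while the far
// buffer collects cold paths such as exception raises. SwitchToFarCode/SwitchToNearCode
// flip armAsm between the two emitters, optionally emitting a (conditional) branch so
// execution can cross from one buffer to the other.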
416
417
void CPU::ARM32Recompiler::SwitchToFarCodeIfBitSet(const vixl::aarch32::Register& reg, u32 bit)
418
{
419
armAsm->tst(reg, 1u << bit);
420
421
const s32 disp = armGetPCDisplacement(GetCurrentCodePointer(), m_far_emitter.GetCursorAddress<const void*>());
422
if (armIsPCDisplacementInImmediateRange(disp))
423
{
424
Label ldisp(armAsm->GetCursorOffset() + disp);
425
armAsm->b(ne, &ldisp);
426
}
427
else
428
{
429
Label skip;
430
armAsm->b(eq, &skip);
431
armEmitJmp(armAsm, m_far_emitter.GetCursorAddress<const void*>(), true);
432
armAsm->bind(&skip);
433
}
434
435
armAsm = &m_far_emitter;
436
}
437
438
void CPU::ARM32Recompiler::SwitchToFarCodeIfRegZeroOrNonZero(const vixl::aarch32::Register& reg, bool nonzero)
439
{
440
armAsm->cmp(reg, 0);
441
442
const s32 disp = armGetPCDisplacement(GetCurrentCodePointer(), m_far_emitter.GetCursorAddress<const void*>());
443
if (armIsPCDisplacementInImmediateRange(disp))
444
{
445
Label ldisp(armAsm->GetCursorOffset() + disp);
446
nonzero ? armAsm->b(ne, &ldisp) : armAsm->b(eq, &ldisp);
447
}
448
else
449
{
450
Label skip;
451
nonzero ? armAsm->b(eq, &skip) : armAsm->b(ne, &skip);
452
armEmitJmp(armAsm, m_far_emitter.GetCursorAddress<const void*>(), true);
453
armAsm->bind(&skip);
454
}
455
456
armAsm = &m_far_emitter;
457
}
458
459
void CPU::ARM32Recompiler::SwitchToNearCode(bool emit_jump, vixl::aarch32::ConditionType cond)
460
{
461
DebugAssert(armAsm == &m_far_emitter);
462
if (emit_jump)
463
{
464
const s32 disp = armGetPCDisplacement(GetCurrentCodePointer(), m_emitter.GetCursorAddress<const void*>());
465
if (armIsPCDisplacementInImmediateRange(disp))
466
{
467
Label ldisp(armAsm->GetCursorOffset() + disp);
468
armAsm->b(cond, &ldisp);
469
}
470
else if (cond != vixl::aarch32::al)
471
{
472
Label skip;
473
armAsm->b(Condition(cond).Negate(), &skip);
474
armEmitJmp(armAsm, m_emitter.GetCursorAddress<const void*>(), true);
475
armAsm->bind(&skip);
476
}
477
else
478
{
479
armEmitJmp(armAsm, m_emitter.GetCursorAddress<const void*>(), true);
480
}
481
}
482
armAsm = &m_emitter;
483
}
484
485
void CPU::ARM32Recompiler::EmitMov(const vixl::aarch32::Register& dst, u32 val)
486
{
487
armEmitMov(armAsm, dst, val);
488
}
489
490
void CPU::ARM32Recompiler::EmitCall(const void* ptr, bool force_inline /*= false*/)
491
{
492
armEmitCall(armAsm, ptr, force_inline);
493
}
494
495
vixl::aarch32::Operand CPU::ARM32Recompiler::armCheckAddSubConstant(s32 val)
496
{
497
if (ImmediateA32::IsImmediateA32(static_cast<u32>(val)))
498
return vixl::aarch32::Operand(static_cast<int32_t>(val));
499
500
EmitMov(RSCRATCH, static_cast<u32>(val));
501
return vixl::aarch32::Operand(RSCRATCH);
502
}
503
504
vixl::aarch32::Operand CPU::ARM32Recompiler::armCheckAddSubConstant(u32 val)
505
{
506
return armCheckAddSubConstant(static_cast<s32>(val));
507
}
508
509
vixl::aarch32::Operand CPU::ARM32Recompiler::armCheckCompareConstant(s32 val)
510
{
511
return armCheckAddSubConstant(val);
512
}
513
514
vixl::aarch32::Operand CPU::ARM32Recompiler::armCheckLogicalConstant(u32 val)
515
{
516
return armCheckAddSubConstant(val);
517
}
518
519
void CPU::ARM32Recompiler::BeginBlock()
520
{
521
Recompiler::BeginBlock();
522
}
523
524
void CPU::ARM32Recompiler::GenerateBlockProtectCheck(const u8* ram_ptr, const u8* shadow_ptr, u32 size)
525
{
526
// Load both base pointers up front so the compare loop below can use immediate offsets.
527
armMoveAddressToReg(armAsm, RARG1, ram_ptr);
528
armMoveAddressToReg(armAsm, RARG2, shadow_ptr);
529
530
u32 offset = 0;
531
Label block_changed;
532
533
#if 0
534
/* TODO: Vectorize
535
#include <arm_neon.h>
536
#include <stdint.h>
537
538
bool foo(const void* a, const void* b)
539
{
540
uint8x16_t v1 = vld1q_u8((const uint8_t*)a);
541
uint8x16_t v2 = vld1q_u8((const uint8_t*)b);
542
uint8x16_t v3 = vld1q_u8((const uint8_t*)a + 16);
543
uint8x16_t v4 = vld1q_u8((const uint8_t*)b + 16);
544
uint8x16_t r = vceqq_u8(v1, v2);
545
uint8x16_t r2 = vceqq_u8(v3, v4);
546
uint8x16_t r3 = vandq_u8(r, r2);
547
uint32x2_t rr = vpmin_u32(vget_low_u32(vreinterpretq_u32_u8(r3)), vget_high_u32(vreinterpretq_u32_u8(r3)));
548
if ((vget_lane_u32(rr, 0) & vget_lane_u32(rr, 1)) != 0xFFFFFFFFu)
549
return false;
550
else
551
return true;
552
}
553
*/
554
bool first = true;
555
556
while (size >= 16)
557
{
558
const VRegister vtmp = a32::v2.V4S();
559
const VRegister dst = first ? a32::v0.V4S() : a32::v1.V4S();
560
m_emit->ldr(dst, a32::MemOperand(RXARG1, offset));
561
m_emit->ldr(vtmp, a32::MemOperand(RXARG2, offset));
562
m_emit->cmeq(dst, dst, vtmp);
563
if (!first)
564
m_emit->and_(dst.V16B(), dst.V16B(), vtmp.V16B());
565
else
566
first = false;
567
568
offset += 16;
569
size -= 16;
570
}
571
572
if (!first)
573
{
574
// TODO: make sure this doesn't choke on ffffffff
575
armAsm->uminv(a32::s0, a32::v0.V4S());
576
armAsm->fcmp(a32::s0, 0.0);
577
armAsm->b(&block_changed, a32::eq);
578
}
579
#endif
580
581
while (size >= 4)
582
{
583
armAsm->ldr(RARG3, MemOperand(RARG1, offset));
584
armAsm->ldr(RSCRATCH, MemOperand(RARG2, offset));
585
armAsm->cmp(RARG3, RSCRATCH);
586
armAsm->b(ne, &block_changed);
587
offset += 4;
588
size -= 4;
589
}
590
591
DebugAssert(size == 0);
592
593
Label block_unchanged;
594
armAsm->b(&block_unchanged);
595
armAsm->bind(&block_changed);
596
armEmitJmp(armAsm, CodeCache::g_discard_and_recompile_block, false);
597
armAsm->bind(&block_unchanged);
598
}
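// The protection check compares the block's RAM contents against the shadow copy taken at
// compile time, one 32-bit word at a time. Any mismatch branches to
// g_discard_and_recompile_block so the stale block is thrown away; otherwise execution
// falls through into the block body. A NEON version is sketched above but currently disabled.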
599
600
void CPU::ARM32Recompiler::GenerateICacheCheckAndUpdate()
601
{
602
if (!m_block->HasFlag(CodeCache::BlockFlags::IsUsingICache))
603
{
604
if (m_block->HasFlag(CodeCache::BlockFlags::NeedsDynamicFetchTicks))
605
{
606
armEmitFarLoad(armAsm, RARG2, GetFetchMemoryAccessTimePtr());
607
armAsm->ldr(RARG1, PTR(&g_state.pending_ticks));
608
armEmitMov(armAsm, RARG3, m_block->size);
609
armAsm->mul(RARG2, RARG2, RARG3);
610
armAsm->add(RARG1, RARG1, RARG2);
611
armAsm->str(RARG1, PTR(&g_state.pending_ticks));
612
}
613
else
614
{
615
armAsm->ldr(RARG1, PTR(&g_state.pending_ticks));
616
armAsm->add(RARG1, RARG1, armCheckAddSubConstant(static_cast<u32>(m_block->uncached_fetch_ticks)));
617
armAsm->str(RARG1, PTR(&g_state.pending_ticks));
618
}
619
}
620
else if (m_block->icache_line_count > 0)
621
{
622
VirtualMemoryAddress current_pc = m_block->pc & ICACHE_TAG_ADDRESS_MASK;
623
const TickCount fill_ticks = GetICacheFillTicks(current_pc);
624
if (fill_ticks <= 0)
625
return;
626
627
const auto& ticks_reg = RARG1;
628
const auto& current_tag_reg = RARG2;
629
const auto& existing_tag_reg = RARG3;
630
const auto& fill_ticks_reg = r5;
631
632
armAsm->ldr(ticks_reg, PTR(&g_state.pending_ticks));
633
armEmitMov(armAsm, current_tag_reg, current_pc);
634
armEmitMov(armAsm, fill_ticks_reg, fill_ticks);
635
636
for (u32 i = 0; i < m_block->icache_line_count; i++, current_pc += ICACHE_LINE_SIZE)
637
{
638
const TickCount fill_ticks = GetICacheFillTicks(current_pc);
639
if (fill_ticks <= 0)
640
continue;
641
642
const u32 line = GetICacheLine(current_pc);
643
const u32 offset = OFFSETOF(State, icache_tags) + (line * sizeof(u32));
644
645
// Offsets must be <4K on ARM.
646
MemOperand line_addr = MemOperand(RSTATE, offset);
647
if (offset >= 4096)
648
{
649
armEmitMov(armAsm, RSCRATCH, offset);
650
line_addr = MemOperand(RSTATE, RSCRATCH);
651
}
652
653
Label cache_hit;
654
armAsm->ldr(existing_tag_reg, line_addr);
655
armAsm->str(current_tag_reg, line_addr);
656
armAsm->cmp(existing_tag_reg, current_tag_reg);
657
armAsm->add(ne, ticks_reg, ticks_reg, fill_ticks_reg);
658
659
if (i != (m_block->icache_line_count - 1))
660
armAsm->add(current_tag_reg, current_tag_reg, armCheckAddSubConstant(ICACHE_LINE_SIZE));
661
}
662
663
armAsm->str(ticks_reg, PTR(&g_state.pending_ticks));
664
}
665
}
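// For blocks executing through the instruction cache, each icache line the block touches is
// checked: the stored tag is loaded, unconditionally replaced with the current tag, and if
// the tags differed the line's fill penalty is added to the tick counter via the
// conditional add above. Blocks that bypass the icache instead charge a flat fetch cost.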
666
667
void CPU::ARM32Recompiler::GenerateCall(const void* func, s32 arg1reg /*= -1*/, s32 arg2reg /*= -1*/,
668
s32 arg3reg /*= -1*/)
669
{
670
if (arg1reg >= 0 && arg1reg != static_cast<s32>(RARG1.GetCode()))
671
armAsm->mov(RARG1, Register(arg1reg));
672
if (arg2reg >= 0 && arg2reg != static_cast<s32>(RARG2.GetCode()))
673
armAsm->mov(RARG2, Register(arg2reg));
674
if (arg3reg >= 0 && arg3reg != static_cast<s32>(RARG3.GetCode()))
675
armAsm->mov(RARG3, Register(arg3reg));
676
EmitCall(func);
677
}
678
679
void CPU::ARM32Recompiler::EndBlock(const std::optional<u32>& newpc, bool do_event_test)
680
{
681
if (newpc.has_value())
682
{
683
if (m_dirty_pc || m_compiler_pc != newpc)
684
{
685
EmitMov(RSCRATCH, newpc.value());
686
armAsm->str(RSCRATCH, PTR(&g_state.pc));
687
}
688
}
689
m_dirty_pc = false;
690
691
// flush regs
692
Flush(FLUSH_END_BLOCK);
693
EndAndLinkBlock(newpc, do_event_test, false);
694
}
695
696
void CPU::ARM32Recompiler::EndBlockWithException(Exception excode)
697
{
698
// flush regs, but not pc, it's going to get overwritten
699
// flush cycles because of the GTE instruction stuff...
700
Flush(FLUSH_END_BLOCK | FLUSH_FOR_EXCEPTION | FLUSH_FOR_C_CALL);
701
702
// TODO: flush load delay
703
704
EmitMov(RARG1, Cop0Registers::CAUSE::MakeValueForException(excode, m_current_instruction_branch_delay_slot, false,
705
inst->cop.cop_n));
706
EmitMov(RARG2, m_current_instruction_pc);
707
if (excode != Exception::BP)
708
{
709
EmitCall(reinterpret_cast<const void*>(static_cast<void (*)(u32, u32)>(&CPU::RaiseException)));
710
}
711
else
712
{
713
EmitMov(RARG3, inst->bits);
714
EmitCall(reinterpret_cast<const void*>(&CPU::RaiseBreakException));
715
}
716
717
m_dirty_pc = false;
718
719
EndAndLinkBlock(std::nullopt, true, false);
720
}
721
722
void CPU::ARM32Recompiler::EndAndLinkBlock(const std::optional<u32>& newpc, bool do_event_test, bool force_run_events)
723
{
724
// event test
725
// pc should've been flushed
726
DebugAssert(!m_dirty_pc && !m_block_ended);
727
m_block_ended = true;
728
729
// TODO: try extracting this to a function
730
731
// save cycles for event test
732
const TickCount cycles = std::exchange(m_cycles, 0);
733
734
// pending_ticks += cycles
735
// if (pending_ticks >= downcount) { dispatch_event(); }
736
if (do_event_test || m_gte_done_cycle > cycles || cycles > 0)
737
armAsm->ldr(RARG1, PTR(&g_state.pending_ticks));
738
if (do_event_test)
739
armAsm->ldr(RARG2, PTR(&g_state.downcount));
740
if (cycles > 0)
741
armAsm->add(RARG1, RARG1, armCheckAddSubConstant(cycles));
742
if (m_gte_done_cycle > cycles)
743
{
744
armAsm->add(RARG2, RARG1, armCheckAddSubConstant(m_gte_done_cycle - cycles));
745
armAsm->str(RARG2, PTR(&g_state.gte_completion_tick));
746
}
747
if (do_event_test)
748
armAsm->cmp(RARG1, RARG2);
749
if (cycles > 0)
750
armAsm->str(RARG1, PTR(&g_state.pending_ticks));
751
if (do_event_test)
752
armEmitCondBranch(armAsm, ge, CodeCache::g_run_events_and_dispatch);
753
754
// jump to dispatcher or next block
755
if (force_run_events)
756
{
757
armEmitJmp(armAsm, CodeCache::g_run_events_and_dispatch, false);
758
}
759
else if (!newpc.has_value())
760
{
761
armEmitJmp(armAsm, CodeCache::g_dispatcher, false);
762
}
763
else
764
{
765
const void* target = (newpc.value() == m_block->pc) ?
766
CodeCache::CreateSelfBlockLink(m_block, armAsm->GetCursorAddress<void*>(),
767
armAsm->GetBuffer()->GetStartAddress<const void*>()) :
768
CodeCache::CreateBlockLink(m_block, armAsm->GetCursorAddress<void*>(), newpc.value());
769
armEmitJmp(armAsm, target, true);
770
}
771
}
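// Ending a block adds the accumulated cycle count to pending_ticks, optionally updates the
// GTE completion tick, and compares against the downcount; if an event is due it branches
// to g_run_events_and_dispatch. Otherwise it jumps either to the dispatcher (unknown or
// indirect successor) or to a direct block link created for the known successor PC, with
// self-loops linked back to the block's own entry via CreateSelfBlockLink.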
772
773
const void* CPU::ARM32Recompiler::EndCompile(u32* code_size, u32* far_code_size)
774
{
775
#ifdef VIXL_DEBUG
776
m_emitter_check.reset();
777
m_far_emitter_check.reset();
778
#endif
779
780
m_emitter.FinalizeCode();
781
m_far_emitter.FinalizeCode();
782
783
u8* const code = m_emitter.GetBuffer()->GetStartAddress<u8*>();
784
*code_size = static_cast<u32>(m_emitter.GetCursorOffset());
785
*far_code_size = static_cast<u32>(m_far_emitter.GetCursorOffset());
786
armAsm = nullptr;
787
return code;
788
}
789
790
const char* CPU::ARM32Recompiler::GetHostRegName(u32 reg) const
791
{
792
static constexpr std::array<const char*, 16> reg32_names = {
793
{"x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15",
794
"x16", "x17", "x18", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "fp", "lr", "sp"}};
795
return (reg < reg32_names.size()) ? reg32_names[reg] : "UNKNOWN";
796
}
797
798
void CPU::ARM32Recompiler::LoadHostRegWithConstant(u32 reg, u32 val)
799
{
800
EmitMov(Register(reg), val);
801
}
802
803
void CPU::ARM32Recompiler::LoadHostRegFromCPUPointer(u32 reg, const void* ptr)
804
{
805
armAsm->ldr(Register(reg), PTR(ptr));
806
}
807
808
void CPU::ARM32Recompiler::StoreHostRegToCPUPointer(u32 reg, const void* ptr)
809
{
810
armAsm->str(Register(reg), PTR(ptr));
811
}
812
813
void CPU::ARM32Recompiler::StoreConstantToCPUPointer(u32 val, const void* ptr)
814
{
815
EmitMov(RSCRATCH, val);
816
armAsm->str(RSCRATCH, PTR(ptr));
817
}
818
819
void CPU::ARM32Recompiler::CopyHostReg(u32 dst, u32 src)
820
{
821
if (src != dst)
822
armAsm->mov(Register(dst), Register(src));
823
}
824
825
void CPU::ARM32Recompiler::AssertRegOrConstS(CompileFlags cf) const
826
{
827
DebugAssert(cf.valid_host_s || cf.const_s);
828
}
829
830
void CPU::ARM32Recompiler::AssertRegOrConstT(CompileFlags cf) const
831
{
832
DebugAssert(cf.valid_host_t || cf.const_t);
833
}
834
835
vixl::aarch32::MemOperand CPU::ARM32Recompiler::MipsPtr(Reg r) const
836
{
837
DebugAssert(r < Reg::count);
838
return PTR(&g_state.regs.r[static_cast<u32>(r)]);
839
}
840
841
vixl::aarch32::Register CPU::ARM32Recompiler::CFGetRegD(CompileFlags cf) const
842
{
843
DebugAssert(cf.valid_host_d);
844
return Register(cf.host_d);
845
}
846
847
vixl::aarch32::Register CPU::ARM32Recompiler::CFGetRegS(CompileFlags cf) const
848
{
849
DebugAssert(cf.valid_host_s);
850
return Register(cf.host_s);
851
}
852
853
vixl::aarch32::Register CPU::ARM32Recompiler::CFGetRegT(CompileFlags cf) const
854
{
855
DebugAssert(cf.valid_host_t);
856
return Register(cf.host_t);
857
}
858
859
vixl::aarch32::Register CPU::ARM32Recompiler::CFGetRegLO(CompileFlags cf) const
860
{
861
DebugAssert(cf.valid_host_lo);
862
return Register(cf.host_lo);
863
}
864
865
vixl::aarch32::Register CPU::ARM32Recompiler::CFGetRegHI(CompileFlags cf) const
866
{
867
DebugAssert(cf.valid_host_hi);
868
return Register(cf.host_hi);
869
}
870
871
vixl::aarch32::Register CPU::ARM32Recompiler::GetMembaseReg()
872
{
873
const u32 code = RMEMBASE.GetCode();
874
if (!IsHostRegAllocated(code))
875
{
876
// Leave usable unset, so we don't try to allocate it later.
877
m_host_regs[code].type = HR_TYPE_MEMBASE;
878
m_host_regs[code].flags = HR_ALLOCATED;
879
armAsm->ldr(RMEMBASE, PTR(&g_state.fastmem_base));
880
}
881
882
return RMEMBASE;
883
}
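// The fastmem base lives in g_state.fastmem_base and is lazily loaded into RMEMBASE (r3)
// the first time a block needs it. Marking the register allocated but not usable keeps the
// allocator from handing it out while still tracking that it holds live data.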
884
885
void CPU::ARM32Recompiler::MoveSToReg(const vixl::aarch32::Register& dst, CompileFlags cf)
886
{
887
if (cf.valid_host_s)
888
{
889
if (cf.host_s != dst.GetCode())
890
armAsm->mov(dst, Register(cf.host_s));
891
}
892
else if (cf.const_s)
893
{
894
const u32 cv = GetConstantRegU32(cf.MipsS());
895
EmitMov(dst, cv);
896
}
897
else
898
{
899
WARNING_LOG("Hit memory path in MoveSToReg() for {}", GetRegName(cf.MipsS()));
900
armAsm->ldr(dst, PTR(&g_state.regs.r[cf.mips_s]));
901
}
902
}
903
904
void CPU::ARM32Recompiler::MoveTToReg(const vixl::aarch32::Register& dst, CompileFlags cf)
905
{
906
if (cf.valid_host_t)
907
{
908
if (cf.host_t != dst.GetCode())
909
armAsm->mov(dst, Register(cf.host_t));
910
}
911
else if (cf.const_t)
912
{
913
const u32 cv = GetConstantRegU32(cf.MipsT());
914
EmitMov(dst, cv);
915
}
916
else
917
{
918
WARNING_LOG("Hit memory path in MoveTToReg() for {}", GetRegName(cf.MipsT()));
919
armAsm->ldr(dst, PTR(&g_state.regs.r[cf.mips_t]));
920
}
921
}
922
923
void CPU::ARM32Recompiler::MoveMIPSRegToReg(const vixl::aarch32::Register& dst, Reg reg, bool ignore_load_delays)
924
{
925
DebugAssert(reg < Reg::count);
926
if (ignore_load_delays && m_load_delay_register == reg)
927
{
928
if (m_load_delay_value_register == NUM_HOST_REGS)
929
armAsm->ldr(dst, PTR(&g_state.load_delay_value));
930
else
931
armAsm->mov(dst, Register(m_load_delay_value_register));
932
}
933
else if (const std::optional<u32> hreg = CheckHostReg(0, Recompiler::HR_TYPE_CPU_REG, reg))
934
{
935
armAsm->mov(dst, Register(hreg.value()));
936
}
937
else if (HasConstantReg(reg))
938
{
939
EmitMov(dst, GetConstantRegU32(reg));
940
}
941
else
942
{
943
armAsm->ldr(dst, MipsPtr(reg));
944
}
945
}
946
947
void CPU::ARM32Recompiler::GeneratePGXPCallWithMIPSRegs(const void* func, u32 arg1val, Reg arg2reg /* = Reg::count */,
948
Reg arg3reg /* = Reg::count */)
949
{
950
DebugAssert(g_settings.gpu_pgxp_enable);
951
952
Flush(FLUSH_FOR_C_CALL);
953
954
if (arg2reg != Reg::count)
955
MoveMIPSRegToReg(RARG2, arg2reg);
956
if (arg3reg != Reg::count)
957
MoveMIPSRegToReg(RARG3, arg3reg);
958
959
EmitMov(RARG1, arg1val);
960
EmitCall(func);
961
}
962
963
void CPU::ARM32Recompiler::Flush(u32 flags)
964
{
965
Recompiler::Flush(flags);
966
967
if (flags & FLUSH_PC && m_dirty_pc)
968
{
969
StoreConstantToCPUPointer(m_compiler_pc, &g_state.pc);
970
m_dirty_pc = false;
971
}
972
973
if (flags & FLUSH_INSTRUCTION_BITS)
974
{
975
// This sucks, but it's only used for fallbacks.
976
EmitMov(RARG1, inst->bits);
977
EmitMov(RARG2, m_current_instruction_pc);
978
EmitMov(RARG3, m_current_instruction_branch_delay_slot);
979
armAsm->str(RARG1, PTR(&g_state.current_instruction.bits));
980
armAsm->str(RARG2, PTR(&g_state.current_instruction_pc));
981
armAsm->strb(RARG3, PTR(&g_state.current_instruction_in_branch_delay_slot));
982
}
983
984
if (flags & FLUSH_LOAD_DELAY_FROM_STATE && m_load_delay_dirty)
985
{
986
// This sucks :(
987
// TODO: make it a function?
988
armAsm->ldrb(RARG1, PTR(&g_state.load_delay_reg));
989
armAsm->ldr(RARG2, PTR(&g_state.load_delay_value));
990
EmitMov(RSCRATCH, OFFSETOF(CPU::State, regs.r[0]));
991
armAsm->add(RARG1, RSCRATCH, vixl::aarch32::Operand(RARG1, LSL, 2));
992
armAsm->str(RARG2, MemOperand(RSTATE, RARG1));
993
EmitMov(RSCRATCH, static_cast<u8>(Reg::count));
994
armAsm->strb(RSCRATCH, PTR(&g_state.load_delay_reg));
995
m_load_delay_dirty = false;
996
}
997
998
if (flags & FLUSH_LOAD_DELAY && m_load_delay_register != Reg::count)
999
{
1000
if (m_load_delay_value_register != NUM_HOST_REGS)
1001
FreeHostReg(m_load_delay_value_register);
1002
1003
EmitMov(RSCRATCH, static_cast<u8>(m_load_delay_register));
1004
armAsm->strb(RSCRATCH, PTR(&g_state.load_delay_reg));
1005
m_load_delay_register = Reg::count;
1006
m_load_delay_dirty = true;
1007
}
1008
1009
if (flags & FLUSH_GTE_STALL_FROM_STATE && m_dirty_gte_done_cycle)
1010
{
1011
// May as well flush cycles while we're here.
1012
// GTE spanning blocks is very rare, we _could_ disable this for speed.
1013
armAsm->ldr(RARG1, PTR(&g_state.pending_ticks));
1014
armAsm->ldr(RARG2, PTR(&g_state.gte_completion_tick));
1015
if (m_cycles > 0)
1016
{
1017
armAsm->add(RARG1, RARG1, armCheckAddSubConstant(m_cycles));
1018
m_cycles = 0;
1019
}
1020
armAsm->cmp(RARG2, RARG1);
1021
armAsm->mov(hs, RARG1, RARG2);
1022
armAsm->str(RARG1, PTR(&g_state.pending_ticks));
1023
m_dirty_gte_done_cycle = false;
1024
}
1025
1026
if (flags & FLUSH_GTE_DONE_CYCLE && m_gte_done_cycle > m_cycles)
1027
{
1028
armAsm->ldr(RARG1, PTR(&g_state.pending_ticks));
1029
1030
// update cycles at the same time
1031
if (flags & FLUSH_CYCLES && m_cycles > 0)
1032
{
1033
armAsm->add(RARG1, RARG1, armCheckAddSubConstant(m_cycles));
1034
armAsm->str(RARG1, PTR(&g_state.pending_ticks));
1035
m_gte_done_cycle -= m_cycles;
1036
m_cycles = 0;
1037
}
1038
1039
armAsm->add(RARG1, RARG1, armCheckAddSubConstant(m_gte_done_cycle));
1040
armAsm->str(RARG1, PTR(&g_state.gte_completion_tick));
1041
m_gte_done_cycle = 0;
1042
m_dirty_gte_done_cycle = true;
1043
}
1044
1045
if (flags & FLUSH_CYCLES && m_cycles > 0)
1046
{
1047
armAsm->ldr(RARG1, PTR(&g_state.pending_ticks));
1048
armAsm->add(RARG1, RARG1, armCheckAddSubConstant(m_cycles));
1049
armAsm->str(RARG1, PTR(&g_state.pending_ticks));
1050
m_gte_done_cycle = std::max<TickCount>(m_gte_done_cycle - m_cycles, 0);
1051
m_cycles = 0;
1052
}
1053
}
1054
1055
void CPU::ARM32Recompiler::Compile_Fallback()
1056
{
1057
WARNING_LOG("Compiling instruction fallback at PC=0x{:08X}, instruction=0x{:08X}", m_current_instruction_pc,
1058
inst->bits);
1059
1060
Flush(FLUSH_FOR_INTERPRETER);
1061
1062
EmitCall(reinterpret_cast<const void*>(&CPU::RecompilerThunks::InterpretInstruction));
1063
1064
// TODO: make me less garbage
1065
// TODO: this is wrong, it flushes the load delay on the same cycle when we return.
1066
// but nothing should be going through here..
1067
Label no_load_delay;
1068
armAsm->ldrb(RARG1, PTR(&g_state.next_load_delay_reg));
1069
armAsm->cmp(RARG1, static_cast<u8>(Reg::count));
1070
armAsm->b(eq, &no_load_delay);
1071
armAsm->ldr(RARG2, PTR(&g_state.next_load_delay_value));
1072
armAsm->strb(RARG1, PTR(&g_state.load_delay_reg));
1073
armAsm->str(RARG2, PTR(&g_state.load_delay_value));
1074
EmitMov(RARG1, static_cast<u32>(Reg::count));
1075
armAsm->strb(RARG1, PTR(&g_state.next_load_delay_reg));
1076
armAsm->bind(&no_load_delay);
1077
1078
m_load_delay_dirty = EMULATE_LOAD_DELAYS;
1079
}
1080
1081
void CPU::ARM32Recompiler::CheckBranchTarget(const vixl::aarch32::Register& pcreg)
1082
{
1083
if (!g_settings.cpu_recompiler_memory_exceptions)
1084
return;
1085
1086
armAsm->tst(pcreg, armCheckLogicalConstant(0x3));
1087
SwitchToFarCode(true, ne);
1088
1089
BackupHostState();
1090
EndBlockWithException(Exception::AdEL);
1091
1092
RestoreHostState();
1093
SwitchToNearCode(false);
1094
}
1095
1096
void CPU::ARM32Recompiler::Compile_jr(CompileFlags cf)
1097
{
1098
const Register pcreg = CFGetRegS(cf);
1099
CheckBranchTarget(pcreg);
1100
1101
armAsm->str(pcreg, PTR(&g_state.pc));
1102
1103
CompileBranchDelaySlot(false);
1104
EndBlock(std::nullopt, true);
1105
}
1106
1107
void CPU::ARM32Recompiler::Compile_jalr(CompileFlags cf)
1108
{
1109
const Register pcreg = CFGetRegS(cf);
1110
if (MipsD() != Reg::zero)
1111
SetConstantReg(MipsD(), GetBranchReturnAddress(cf));
1112
1113
CheckBranchTarget(pcreg);
1114
armAsm->str(pcreg, PTR(&g_state.pc));
1115
1116
CompileBranchDelaySlot(false);
1117
EndBlock(std::nullopt, true);
1118
}
1119
1120
void CPU::ARM32Recompiler::Compile_bxx(CompileFlags cf, BranchCondition cond)
1121
{
1122
AssertRegOrConstS(cf);
1123
1124
const u32 taken_pc = GetConditionalBranchTarget(cf);
1125
1126
Flush(FLUSH_FOR_BRANCH);
1127
1128
DebugAssert(cf.valid_host_s);
1129
1130
// MipsT() here should equal zero for zero branches.
1131
DebugAssert(cond == BranchCondition::Equal || cond == BranchCondition::NotEqual || cf.MipsT() == Reg::zero);
1132
1133
Label taken;
1134
const Register rs = CFGetRegS(cf);
1135
switch (cond)
1136
{
1137
case BranchCondition::Equal:
1138
case BranchCondition::NotEqual:
1139
{
1140
AssertRegOrConstT(cf);
1141
if (cf.valid_host_t)
1142
armAsm->cmp(rs, CFGetRegT(cf));
1143
else if (cf.const_t)
1144
armAsm->cmp(rs, armCheckCompareConstant(GetConstantRegU32(cf.MipsT())));
1145
1146
armAsm->b((cond == BranchCondition::Equal) ? eq : ne, &taken);
1147
}
1148
break;
1149
1150
case BranchCondition::GreaterThanZero:
1151
{
1152
armAsm->cmp(rs, 0);
1153
armAsm->b(gt, &taken);
1154
}
1155
break;
1156
1157
case BranchCondition::GreaterEqualZero:
1158
{
1159
armAsm->cmp(rs, 0);
1160
armAsm->b(ge, &taken);
1161
}
1162
break;
1163
1164
case BranchCondition::LessThanZero:
1165
{
1166
armAsm->cmp(rs, 0);
1167
armAsm->b(lt, &taken);
1168
}
1169
break;
1170
1171
case BranchCondition::LessEqualZero:
1172
{
1173
armAsm->cmp(rs, 0);
1174
armAsm->b(le, &taken);
1175
}
1176
break;
1177
}
1178
1179
BackupHostState();
1180
if (!cf.delay_slot_swapped)
1181
CompileBranchDelaySlot();
1182
1183
EndBlock(m_compiler_pc, true);
1184
1185
armAsm->bind(&taken);
1186
1187
RestoreHostState();
1188
if (!cf.delay_slot_swapped)
1189
CompileBranchDelaySlot();
1190
1191
EndBlock(taken_pc, true);
1192
}
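// Conditional branches are compiled as two exits: the compare falls through into the
// not-taken path (delay slot, then EndBlock at the next sequential PC) and branches to the
// taken label (delay slot again under restored register state, then EndBlock at the branch
// target). When the delay slot was already hoisted (delay_slot_swapped), it is not
// recompiled on either path.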
1193
1194
void CPU::ARM32Recompiler::Compile_addi(CompileFlags cf, bool overflow)
1195
{
1196
const Register rs = CFGetRegS(cf);
1197
const Register rt = CFGetRegT(cf);
1198
if (const u32 imm = inst->i.imm_sext32(); imm != 0)
1199
{
1200
if (!overflow)
1201
{
1202
armAsm->add(rt, rs, armCheckAddSubConstant(imm));
1203
}
1204
else
1205
{
1206
armAsm->adds(rt, rs, armCheckAddSubConstant(imm));
1207
TestOverflow(rt);
1208
}
1209
}
1210
else if (rt.GetCode() != rs.GetCode())
1211
{
1212
armAsm->mov(rt, rs);
1213
}
1214
}
1215
1216
void CPU::ARM32Recompiler::Compile_addi(CompileFlags cf)
1217
{
1218
Compile_addi(cf, g_settings.cpu_recompiler_memory_exceptions);
1219
}
1220
1221
void CPU::ARM32Recompiler::Compile_addiu(CompileFlags cf)
1222
{
1223
Compile_addi(cf, false);
1224
}
1225
1226
void CPU::ARM32Recompiler::Compile_slti(CompileFlags cf)
1227
{
1228
Compile_slti(cf, true);
1229
}
1230
1231
void CPU::ARM32Recompiler::Compile_sltiu(CompileFlags cf)
1232
{
1233
Compile_slti(cf, false);
1234
}
1235
1236
void CPU::ARM32Recompiler::Compile_slti(CompileFlags cf, bool sign)
1237
{
1238
const Register rs = CFGetRegS(cf);
1239
const Register rt = CFGetRegT(cf);
1240
armAsm->cmp(rs, armCheckCompareConstant(static_cast<s32>(inst->i.imm_sext32())));
1241
armAsm->mov(sign ? ge : hs, rt, 0);
1242
armAsm->mov(sign ? lt : lo, rt, 1);
1243
}
1244
1245
void CPU::ARM32Recompiler::Compile_andi(CompileFlags cf)
1246
{
1247
const Register rt = CFGetRegT(cf);
1248
if (const u32 imm = inst->i.imm_zext32(); imm != 0)
1249
armAsm->and_(rt, CFGetRegS(cf), armCheckLogicalConstant(imm));
1250
else
1251
EmitMov(rt, 0);
1252
}
1253
1254
void CPU::ARM32Recompiler::Compile_ori(CompileFlags cf)
1255
{
1256
const Register rt = CFGetRegT(cf);
1257
const Register rs = CFGetRegS(cf);
1258
if (const u32 imm = inst->i.imm_zext32(); imm != 0)
1259
armAsm->orr(rt, rs, armCheckLogicalConstant(imm));
1260
else if (rt.GetCode() != rs.GetCode())
1261
armAsm->mov(rt, rs);
1262
}
1263
1264
void CPU::ARM32Recompiler::Compile_xori(CompileFlags cf)
1265
{
1266
const Register rt = CFGetRegT(cf);
1267
const Register rs = CFGetRegS(cf);
1268
if (const u32 imm = inst->i.imm_zext32(); imm != 0)
1269
armAsm->eor(rt, rs, armCheckLogicalConstant(imm));
1270
else if (rt.GetCode() != rs.GetCode())
1271
armAsm->mov(rt, rs);
1272
}
1273
1274
void CPU::ARM32Recompiler::Compile_shift(CompileFlags cf,
1275
void (vixl::aarch32::Assembler::*op)(vixl::aarch32::Register,
1276
vixl::aarch32::Register, const Operand&))
1277
{
1278
const Register rd = CFGetRegD(cf);
1279
const Register rt = CFGetRegT(cf);
1280
if (inst->r.shamt > 0)
1281
(armAsm->*op)(rd, rt, inst->r.shamt.GetValue());
1282
else if (rd.GetCode() != rt.GetCode())
1283
armAsm->mov(rd, rt);
1284
}
1285
1286
void CPU::ARM32Recompiler::Compile_sll(CompileFlags cf)
1287
{
1288
Compile_shift(cf, &Assembler::lsl);
1289
}
1290
1291
void CPU::ARM32Recompiler::Compile_srl(CompileFlags cf)
1292
{
1293
Compile_shift(cf, &Assembler::lsr);
1294
}
1295
1296
void CPU::ARM32Recompiler::Compile_sra(CompileFlags cf)
1297
{
1298
Compile_shift(cf, &Assembler::asr);
1299
}
1300
1301
void CPU::ARM32Recompiler::Compile_variable_shift(CompileFlags cf,
1302
void (vixl::aarch32::Assembler::*op)(vixl::aarch32::Register,
1303
vixl::aarch32::Register,
1304
const Operand&))
1305
{
1306
const Register rd = CFGetRegD(cf);
1307
1308
AssertRegOrConstS(cf);
1309
AssertRegOrConstT(cf);
1310
1311
const Register rt = cf.valid_host_t ? CFGetRegT(cf) : RARG2;
1312
if (!cf.valid_host_t)
1313
MoveTToReg(rt, cf);
1314
1315
if (cf.const_s)
1316
{
1317
if (const u32 shift = GetConstantRegU32(cf.MipsS()); shift != 0)
1318
(armAsm->*op)(rd, rt, shift & 0x1Fu);
1319
else if (rd.GetCode() != rt.GetCode())
1320
armAsm->mov(rd, rt);
1321
}
1322
else
1323
{
1324
armAsm->and_(RSCRATCH, CFGetRegS(cf), 0x1Fu);
1325
(armAsm->*op)(rd, rt, RSCRATCH);
1326
}
1327
}
1328
1329
void CPU::ARM32Recompiler::Compile_sllv(CompileFlags cf)
1330
{
1331
Compile_variable_shift(cf, &Assembler::lsl);
1332
}
1333
1334
void CPU::ARM32Recompiler::Compile_srlv(CompileFlags cf)
1335
{
1336
Compile_variable_shift(cf, &Assembler::lsr);
1337
}
1338
1339
void CPU::ARM32Recompiler::Compile_srav(CompileFlags cf)
1340
{
1341
Compile_variable_shift(cf, &Assembler::asr);
1342
}
1343
1344
void CPU::ARM32Recompiler::Compile_mult(CompileFlags cf, bool sign)
1345
{
1346
const Register rs = cf.valid_host_s ? CFGetRegS(cf) : RARG1;
1347
if (!cf.valid_host_s)
1348
MoveSToReg(rs, cf);
1349
1350
const Register rt = cf.valid_host_t ? CFGetRegT(cf) : RARG2;
1351
if (!cf.valid_host_t)
1352
MoveTToReg(rt, cf);
1353
1354
// TODO: if lo/hi gets killed, we can use a 32-bit multiply
1355
const Register lo = CFGetRegLO(cf);
1356
const Register hi = CFGetRegHI(cf);
1357
1358
(sign) ? armAsm->smull(lo, hi, rs, rt) : armAsm->umull(lo, hi, rs, rt);
1359
}
1360
1361
void CPU::ARM32Recompiler::Compile_mult(CompileFlags cf)
1362
{
1363
Compile_mult(cf, true);
1364
}
1365
1366
void CPU::ARM32Recompiler::Compile_multu(CompileFlags cf)
1367
{
1368
Compile_mult(cf, false);
1369
}
1370
1371
void CPU::ARM32Recompiler::Compile_div(CompileFlags cf)
1372
{
1373
const Register rs = cf.valid_host_s ? CFGetRegS(cf) : RARG1;
1374
if (!cf.valid_host_s)
1375
MoveSToReg(rs, cf);
1376
1377
const Register rt = cf.valid_host_t ? CFGetRegT(cf) : RARG2;
1378
if (!cf.valid_host_t)
1379
MoveTToReg(rt, cf);
1380
1381
const Register rlo = CFGetRegLO(cf);
1382
const Register rhi = CFGetRegHI(cf);
1383
1384
// TODO: This could be slightly more optimal
1385
Label done;
1386
Label not_divide_by_zero;
1387
armAsm->cmp(rt, 0);
1388
armAsm->b(ne, &not_divide_by_zero);
1389
armAsm->mov(rhi, rs); // hi = num
1390
EmitMov(rlo, 1);
1391
EmitMov(RSCRATCH, static_cast<u32>(-1));
1392
armAsm->cmp(rs, 0);
1393
armAsm->mov(ge, rlo, RSCRATCH); // lo = s >= 0 ? -1 : 1
1394
armAsm->b(&done);
1395
1396
armAsm->bind(&not_divide_by_zero);
1397
Label not_unrepresentable;
1398
armAsm->cmp(rs, armCheckCompareConstant(static_cast<s32>(0x80000000u)));
1399
armAsm->b(ne, &not_unrepresentable);
1400
armAsm->cmp(rt, armCheckCompareConstant(-1));
1401
armAsm->b(ne, &not_unrepresentable);
1402
1403
EmitMov(rlo, 0x80000000u);
1404
EmitMov(rhi, 0);
1405
armAsm->b(&done);
1406
1407
armAsm->bind(&not_unrepresentable);
1408
1409
armAsm->sdiv(rlo, rs, rt);
1410
1411
// TODO: skip when hi is dead
1412
armAsm->mls(rhi, rlo, rt, rs);
1413
1414
armAsm->bind(&done);
1415
}
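// The generated code mirrors the MIPS DIV edge cases before using the host sdiv: dividing
// by zero yields hi = rs and lo = 1 or -1 depending on the sign of rs, and
// 0x80000000 / -1 (unrepresentable) yields lo = 0x80000000, hi = 0. The remainder is
// recovered with mls (hi = rs - lo * rt) since ARM has no combined divide/modulo.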
1416
1417
void CPU::ARM32Recompiler::Compile_divu(CompileFlags cf)
1418
{
1419
const Register rs = cf.valid_host_s ? CFGetRegS(cf) : RARG1;
1420
if (!cf.valid_host_s)
1421
MoveSToReg(rs, cf);
1422
1423
const Register rt = cf.valid_host_t ? CFGetRegT(cf) : RARG2;
1424
if (!cf.valid_host_t)
1425
MoveTToReg(rt, cf);
1426
1427
const Register rlo = CFGetRegLO(cf);
1428
const Register rhi = CFGetRegHI(cf);
1429
1430
Label done;
1431
Label not_divide_by_zero;
1432
armAsm->cmp(rt, 0);
1433
armAsm->b(ne, &not_divide_by_zero);
1434
EmitMov(rlo, static_cast<u32>(-1));
1435
armAsm->mov(rhi, rs);
1436
armAsm->b(&done);
1437
1438
armAsm->bind(&not_divide_by_zero);
1439
1440
armAsm->udiv(rlo, rs, rt);
1441
1442
// TODO: skip when hi is dead
1443
armAsm->mls(rhi, rlo, rt, rs);
1444
1445
armAsm->bind(&done);
1446
}
1447
1448
void CPU::ARM32Recompiler::TestOverflow(const vixl::aarch32::Register& result)
1449
{
1450
SwitchToFarCode(true, vs);
1451
1452
BackupHostState();
1453
1454
// toss the result
1455
ClearHostReg(result.GetCode());
1456
1457
EndBlockWithException(Exception::Ov);
1458
1459
RestoreHostState();
1460
1461
SwitchToNearCode(false);
1462
}
1463
1464
void CPU::ARM32Recompiler::Compile_dst_op(CompileFlags cf,
1465
void (vixl::aarch32::Assembler::*op)(vixl::aarch32::Register,
1466
vixl::aarch32::Register, const Operand&),
1467
bool commutative, bool logical, bool overflow)
1468
{
1469
AssertRegOrConstS(cf);
1470
AssertRegOrConstT(cf);
1471
1472
const Register rd = CFGetRegD(cf);
1473
if (cf.valid_host_s && cf.valid_host_t)
1474
{
1475
(armAsm->*op)(rd, CFGetRegS(cf), CFGetRegT(cf));
1476
}
1477
else if (commutative && (cf.const_s || cf.const_t))
1478
{
1479
const Register src = cf.const_s ? CFGetRegT(cf) : CFGetRegS(cf);
1480
if (const u32 cv = GetConstantRegU32(cf.const_s ? cf.MipsS() : cf.MipsT()); cv != 0)
1481
{
1482
(armAsm->*op)(rd, src, logical ? armCheckLogicalConstant(cv) : armCheckAddSubConstant(cv));
1483
}
1484
else
1485
{
1486
if (rd.GetCode() != src.GetCode())
1487
armAsm->mov(rd, src);
1488
overflow = false;
1489
}
1490
}
1491
else if (cf.const_s)
1492
{
1493
EmitMov(RSCRATCH, GetConstantRegU32(cf.MipsS()));
1494
(armAsm->*op)(rd, RSCRATCH, CFGetRegT(cf));
1495
}
1496
else if (cf.const_t)
1497
{
1498
const Register rs = CFGetRegS(cf);
1499
if (const u32 cv = GetConstantRegU32(cf.const_s ? cf.MipsS() : cf.MipsT()); cv != 0)
1500
{
1501
(armAsm->*op)(rd, rs, logical ? armCheckLogicalConstant(cv) : armCheckAddSubConstant(cv));
1502
}
1503
else
1504
{
1505
if (rd.GetCode() != rs.GetCode())
1506
armAsm->mov(rd, rs);
1507
overflow = false;
1508
}
1509
}
1510
1511
if (overflow)
1512
TestOverflow(rd);
1513
}
1514
1515
void CPU::ARM32Recompiler::Compile_add(CompileFlags cf)
1516
{
1517
if (g_settings.cpu_recompiler_memory_exceptions)
1518
Compile_dst_op(cf, &Assembler::adds, true, false, true);
1519
else
1520
Compile_dst_op(cf, &Assembler::add, true, false, false);
1521
}
1522
1523
void CPU::ARM32Recompiler::Compile_addu(CompileFlags cf)
1524
{
1525
Compile_dst_op(cf, &Assembler::add, true, false, false);
1526
}
1527
1528
void CPU::ARM32Recompiler::Compile_sub(CompileFlags cf)
1529
{
1530
if (g_settings.cpu_recompiler_memory_exceptions)
1531
Compile_dst_op(cf, &Assembler::subs, false, false, true);
1532
else
1533
Compile_dst_op(cf, &Assembler::sub, false, false, false);
1534
}
1535
1536
void CPU::ARM32Recompiler::Compile_subu(CompileFlags cf)
1537
{
1538
Compile_dst_op(cf, &Assembler::sub, false, false, false);
1539
}
1540
1541
void CPU::ARM32Recompiler::Compile_and(CompileFlags cf)
1542
{
1543
AssertRegOrConstS(cf);
1544
AssertRegOrConstT(cf);
1545
1546
// special cases - and with self -> self, and with 0 -> 0
1547
const Register regd = CFGetRegD(cf);
1548
if (cf.MipsS() == cf.MipsT())
1549
{
1550
armAsm->mov(regd, CFGetRegS(cf));
1551
return;
1552
}
1553
else if (HasConstantRegValue(cf.MipsS(), 0) || HasConstantRegValue(cf.MipsT(), 0))
1554
{
1555
EmitMov(regd, 0);
1556
return;
1557
}
1558
1559
Compile_dst_op(cf, &Assembler::and_, true, true, false);
1560
}
1561
1562
void CPU::ARM32Recompiler::Compile_or(CompileFlags cf)
1563
{
1564
AssertRegOrConstS(cf);
1565
AssertRegOrConstT(cf);
1566
1567
// or/nor with 0 -> no effect
1568
const Register regd = CFGetRegD(cf);
1569
if (HasConstantRegValue(cf.MipsS(), 0) || HasConstantRegValue(cf.MipsT(), 0) || cf.MipsS() == cf.MipsT())
1570
{
1571
cf.const_s ? MoveTToReg(regd, cf) : MoveSToReg(regd, cf);
1572
return;
1573
}
1574
1575
Compile_dst_op(cf, &Assembler::orr, true, true, false);
1576
}
1577
1578
void CPU::ARM32Recompiler::Compile_xor(CompileFlags cf)
1579
{
1580
AssertRegOrConstS(cf);
1581
AssertRegOrConstT(cf);
1582
1583
const Register regd = CFGetRegD(cf);
1584
if (cf.MipsS() == cf.MipsT())
1585
{
1586
// xor with self -> zero
1587
EmitMov(regd, 0);
1588
return;
1589
}
1590
else if (HasConstantRegValue(cf.MipsS(), 0) || HasConstantRegValue(cf.MipsT(), 0))
1591
{
1592
// xor with zero -> no effect
1593
cf.const_s ? MoveTToReg(regd, cf) : MoveSToReg(regd, cf);
1594
return;
1595
}
1596
1597
Compile_dst_op(cf, &Assembler::eor, true, true, false);
1598
}
1599
1600
void CPU::ARM32Recompiler::Compile_nor(CompileFlags cf)
1601
{
1602
Compile_or(cf);
1603
armAsm->mvn(CFGetRegD(cf), CFGetRegD(cf));
1604
}
1605
1606
void CPU::ARM32Recompiler::Compile_slt(CompileFlags cf)
1607
{
1608
Compile_slt(cf, true);
1609
}
1610
1611
void CPU::ARM32Recompiler::Compile_sltu(CompileFlags cf)
1612
{
1613
Compile_slt(cf, false);
1614
}
1615
1616
void CPU::ARM32Recompiler::Compile_slt(CompileFlags cf, bool sign)
1617
{
1618
AssertRegOrConstS(cf);
1619
AssertRegOrConstT(cf);
1620
1621
// TODO: swap and reverse op for constants
1622
if (cf.const_s)
1623
{
1624
EmitMov(RSCRATCH, GetConstantRegS32(cf.MipsS()));
1625
armAsm->cmp(RSCRATCH, CFGetRegT(cf));
1626
}
1627
else if (cf.const_t)
1628
{
1629
armAsm->cmp(CFGetRegS(cf), armCheckCompareConstant(GetConstantRegS32(cf.MipsT())));
1630
}
1631
else
1632
{
1633
armAsm->cmp(CFGetRegS(cf), CFGetRegT(cf));
1634
}
1635
1636
const Register rd = CFGetRegD(cf);
1637
armAsm->mov(sign ? ge : cs, rd, 0);
1638
armAsm->mov(sign ? lt : lo, rd, 1);
1639
}
1640
1641
vixl::aarch32::Register
1642
CPU::ARM32Recompiler::ComputeLoadStoreAddressArg(CompileFlags cf, const std::optional<VirtualMemoryAddress>& address,
1643
const std::optional<const vixl::aarch32::Register>& reg)
1644
{
1645
const u32 imm = inst->i.imm_sext32();
1646
if (cf.valid_host_s && imm == 0 && !reg.has_value())
1647
return CFGetRegS(cf);
1648
1649
const Register dst = reg.has_value() ? reg.value() : RARG1;
1650
if (address.has_value())
1651
{
1652
EmitMov(dst, address.value());
1653
}
1654
else if (imm == 0)
1655
{
1656
if (cf.valid_host_s)
1657
{
1658
if (const Register src = CFGetRegS(cf); src.GetCode() != dst.GetCode())
1659
armAsm->mov(dst, CFGetRegS(cf));
1660
}
1661
else
1662
{
1663
armAsm->ldr(dst, MipsPtr(cf.MipsS()));
1664
}
1665
}
1666
else
1667
{
1668
if (cf.valid_host_s)
1669
{
1670
armAsm->add(dst, CFGetRegS(cf), armCheckAddSubConstant(static_cast<s32>(inst->i.imm_sext32())));
1671
}
1672
else
1673
{
1674
armAsm->ldr(dst, MipsPtr(cf.MipsS()));
1675
armAsm->add(dst, dst, armCheckAddSubConstant(static_cast<s32>(inst->i.imm_sext32())));
1676
}
1677
}
1678
1679
return dst;
1680
}
1681
1682
template<typename RegAllocFn>
1683
vixl::aarch32::Register CPU::ARM32Recompiler::GenerateLoad(const vixl::aarch32::Register& addr_reg,
1684
MemoryAccessSize size, bool sign, bool use_fastmem,
1685
const RegAllocFn& dst_reg_alloc)
1686
{
1687
if (use_fastmem)
1688
{
1689
DebugAssert(g_settings.cpu_fastmem_mode == CPUFastmemMode::LUT);
1690
m_cycles += Bus::RAM_READ_TICKS;
1691
1692
const Register dst = dst_reg_alloc();
1693
const Register membase = GetMembaseReg();
1694
DebugAssert(addr_reg.GetCode() != RARG3.GetCode());
1695
armAsm->lsr(RARG3, addr_reg, Bus::FASTMEM_LUT_PAGE_SHIFT);
1696
armAsm->ldr(RARG3, MemOperand(membase, RARG3, LSL, 2));
1697
1698
const MemOperand mem = MemOperand(RARG3, addr_reg);
1699
u8* start = armAsm->GetCursorAddress<u8*>();
1700
switch (size)
1701
{
1702
case MemoryAccessSize::Byte:
1703
sign ? armAsm->ldrsb(dst, mem) : armAsm->ldrb(dst, mem);
1704
break;
1705
1706
case MemoryAccessSize::HalfWord:
1707
sign ? armAsm->ldrsh(dst, mem) : armAsm->ldrh(dst, mem);
1708
break;
1709
1710
case MemoryAccessSize::Word:
1711
armAsm->ldr(dst, mem);
1712
break;
1713
}
1714
1715
AddLoadStoreInfo(start, kA32InstructionSizeInBytes, addr_reg.GetCode(), dst.GetCode(), size, sign, true);
1716
return dst;
1717
}
1718
1719
if (addr_reg.GetCode() != RARG1.GetCode())
1720
armAsm->mov(RARG1, addr_reg);
1721
1722
const bool checked = g_settings.cpu_recompiler_memory_exceptions;
1723
switch (size)
1724
{
1725
case MemoryAccessSize::Byte:
1726
{
1727
EmitCall(checked ? reinterpret_cast<const void*>(&RecompilerThunks::ReadMemoryByte) :
1728
reinterpret_cast<const void*>(&RecompilerThunks::UncheckedReadMemoryByte));
1729
}
1730
break;
1731
case MemoryAccessSize::HalfWord:
1732
{
1733
EmitCall(checked ? reinterpret_cast<const void*>(&RecompilerThunks::ReadMemoryHalfWord) :
1734
reinterpret_cast<const void*>(&RecompilerThunks::UncheckedReadMemoryHalfWord));
1735
}
1736
break;
1737
case MemoryAccessSize::Word:
1738
{
1739
EmitCall(checked ? reinterpret_cast<const void*>(&RecompilerThunks::ReadMemoryWord) :
1740
reinterpret_cast<const void*>(&RecompilerThunks::UncheckedReadMemoryWord));
1741
}
1742
break;
1743
}
1744
1745
// TODO: turn this into an asm function instead
1746
if (checked)
1747
{
1748
SwitchToFarCodeIfBitSet(RRETHI, 31);
1749
BackupHostState();
1750
1751
// Need to stash this in a temp because of the flush.
1752
const Register temp = Register(AllocateTempHostReg(HR_CALLEE_SAVED));
1753
armAsm->rsb(temp, RRETHI, 0);
1754
armAsm->lsl(temp, temp, 2);
1755
1756
Flush(FLUSH_FOR_C_CALL | FLUSH_FLUSH_MIPS_REGISTERS | FLUSH_FOR_EXCEPTION);
1757
1758
// cause_bits = (-result << 2) | BD | cop_n
1759
armAsm->orr(RARG1, temp,
1760
armCheckLogicalConstant(Cop0Registers::CAUSE::MakeValueForException(
1761
static_cast<Exception>(0), m_current_instruction_branch_delay_slot, false, inst->cop.cop_n)));
1762
EmitMov(RARG2, m_current_instruction_pc);
1763
EmitCall(reinterpret_cast<const void*>(static_cast<void (*)(u32, u32)>(&CPU::RaiseException)));
1764
FreeHostReg(temp.GetCode());
1765
EndBlock(std::nullopt, true);
1766
1767
RestoreHostState();
1768
SwitchToNearCode(false);
1769
}
1770
1771
const Register dst_reg = dst_reg_alloc();
1772
switch (size)
1773
{
1774
case MemoryAccessSize::Byte:
1775
{
1776
sign ? armAsm->sxtb(dst_reg, RRET) : armAsm->uxtb(dst_reg, RRET);
1777
}
1778
break;
1779
case MemoryAccessSize::HalfWord:
1780
{
1781
sign ? armAsm->sxth(dst_reg, RRET) : armAsm->uxth(dst_reg, RRET);
1782
}
1783
break;
1784
case MemoryAccessSize::Word:
1785
{
1786
if (dst_reg.GetCode() != RRET.GetCode())
1787
armAsm->mov(dst_reg, RRET);
1788
}
1789
break;
1790
}
1791
1792
return dst_reg;
1793
}
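// Fastmem loads go through the two-level LUT: the page index (addr >>
// FASTMEM_LUT_PAGE_SHIFT) selects a host base pointer from the table pointed to by
// RMEMBASE, and the full guest address is then applied as the offset from that base, so a
// hit costs one table lookup plus the access itself. The slow path calls the read thunks;
// in checked mode a set sign bit in RRETHI signals a failure, which is handled on the
// far-code path by rebuilding the CAUSE bits from the negated result and raising the
// exception.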
1794
1795
void CPU::ARM32Recompiler::GenerateStore(const vixl::aarch32::Register& addr_reg,
1796
const vixl::aarch32::Register& value_reg, MemoryAccessSize size,
1797
bool use_fastmem)
1798
{
1799
if (use_fastmem)
1800
{
1801
DebugAssert(g_settings.cpu_fastmem_mode == CPUFastmemMode::LUT);
1802
DebugAssert(addr_reg.GetCode() != RARG3.GetCode());
1803
const Register membase = GetMembaseReg();
1804
armAsm->lsr(RARG3, addr_reg, Bus::FASTMEM_LUT_PAGE_SHIFT);
1805
armAsm->ldr(RARG3, MemOperand(membase, RARG3, LSL, 2));
1806
1807
const MemOperand mem = MemOperand(RARG3, addr_reg);
1808
u8* start = armAsm->GetCursorAddress<u8*>();
1809
switch (size)
1810
{
1811
case MemoryAccessSize::Byte:
1812
armAsm->strb(value_reg, mem);
1813
break;
1814
1815
case MemoryAccessSize::HalfWord:
1816
armAsm->strh(value_reg, mem);
1817
break;
1818
1819
case MemoryAccessSize::Word:
1820
armAsm->str(value_reg, mem);
1821
break;
1822
}
1823
AddLoadStoreInfo(start, kA32InstructionSizeInBytes, addr_reg.GetCode(), value_reg.GetCode(), size, false, false);
1824
return;
1825
}
1826
1827
if (addr_reg.GetCode() != RARG1.GetCode())
1828
armAsm->mov(RARG1, addr_reg);
1829
if (value_reg.GetCode() != RARG2.GetCode())
1830
armAsm->mov(RARG2, value_reg);
1831
1832
const bool checked = g_settings.cpu_recompiler_memory_exceptions;
1833
switch (size)
1834
{
1835
case MemoryAccessSize::Byte:
1836
{
1837
EmitCall(checked ? reinterpret_cast<const void*>(&RecompilerThunks::WriteMemoryByte) :
1838
reinterpret_cast<const void*>(&RecompilerThunks::UncheckedWriteMemoryByte));
1839
}
1840
break;
1841
case MemoryAccessSize::HalfWord:
1842
{
1843
EmitCall(checked ? reinterpret_cast<const void*>(&RecompilerThunks::WriteMemoryHalfWord) :
1844
reinterpret_cast<const void*>(&RecompilerThunks::UncheckedWriteMemoryHalfWord));
1845
}
1846
break;
1847
case MemoryAccessSize::Word:
1848
{
1849
EmitCall(checked ? reinterpret_cast<const void*>(&RecompilerThunks::WriteMemoryWord) :
1850
reinterpret_cast<const void*>(&RecompilerThunks::UncheckedWriteMemoryWord));
1851
}
1852
break;
1853
}
1854
1855
// TODO: turn this into an asm function instead
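// A nonzero return value from the checked write thunks is the exception code; the far path builds the
// CAUSE value from it and raises the exception.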
1856
if (checked)
1857
{
1858
SwitchToFarCodeIfRegZeroOrNonZero(RRET, true);
1859
BackupHostState();
1860
1861
// Need to stash this in a temp because of the flush.
1862
const Register temp = Register(AllocateTempHostReg(HR_CALLEE_SAVED));
1863
armAsm->lsl(temp, RRET, 2);
1864
1865
Flush(FLUSH_FOR_C_CALL | FLUSH_FLUSH_MIPS_REGISTERS | FLUSH_FOR_EXCEPTION);
1866
1867
// cause_bits = (result << 2) | BD | cop_n
1868
armAsm->orr(RARG1, temp,
1869
armCheckLogicalConstant(Cop0Registers::CAUSE::MakeValueForException(
1870
static_cast<Exception>(0), m_current_instruction_branch_delay_slot, false, inst->cop.cop_n)));
1871
EmitMov(RARG2, m_current_instruction_pc);
1872
EmitCall(reinterpret_cast<const void*>(static_cast<void (*)(u32, u32)>(&CPU::RaiseException)));
1873
FreeHostReg(temp.GetCode());
1874
EndBlock(std::nullopt, true);
1875
1876
RestoreHostState();
1877
SwitchToNearCode(false);
1878
}
1879
}
1880
1881
void CPU::ARM32Recompiler::Compile_lxx(CompileFlags cf, MemoryAccessSize size, bool sign, bool use_fastmem,
1882
const std::optional<VirtualMemoryAddress>& address)
1883
{
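// With PGXP enabled the address must survive the memory call, so it is kept in a callee-saved temp and
// handed to the PGXP hook afterwards.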
1884
const std::optional<Register> addr_reg = g_settings.gpu_pgxp_enable ?
1885
std::optional<Register>(Register(AllocateTempHostReg(HR_CALLEE_SAVED))) :
1886
std::optional<Register>();
1887
FlushForLoadStore(address, false, use_fastmem);
1888
const Register addr = ComputeLoadStoreAddressArg(cf, address, addr_reg);
1889
const Register data = GenerateLoad(addr, size, sign, use_fastmem, [this, cf]() {
1890
if (cf.MipsT() == Reg::zero)
1891
return RRET;
1892
1893
return Register(AllocateHostReg(GetFlagsForNewLoadDelayedReg(),
1894
EMULATE_LOAD_DELAYS ? HR_TYPE_NEXT_LOAD_DELAY_VALUE : HR_TYPE_CPU_REG, cf.MipsT()));
1895
});
1896
1897
if (g_settings.gpu_pgxp_enable)
1898
{
1899
Flush(FLUSH_FOR_C_CALL);
1900
1901
EmitMov(RARG1, inst->bits);
1902
armAsm->mov(RARG2, addr);
1903
armAsm->mov(RARG3, data);
1904
EmitCall(s_pgxp_mem_load_functions[static_cast<u32>(size)][static_cast<u32>(sign)]);
1905
FreeHostReg(addr_reg.value().GetCode());
1906
}
1907
}
1908
1909
void CPU::ARM32Recompiler::Compile_lwx(CompileFlags cf, MemoryAccessSize size, bool sign, bool use_fastmem,
1910
const std::optional<VirtualMemoryAddress>& address)
1911
{
1912
DebugAssert(size == MemoryAccessSize::Word && !sign);
1913
1914
const Register addr = Register(AllocateTempHostReg(HR_CALLEE_SAVED));
1915
FlushForLoadStore(address, false, use_fastmem);
1916
1917
// TODO: if address is constant, this can be simplified..
1918
1919
// If we're coming from another block, just flush the load delay and hope for the best..
1920
if (m_load_delay_dirty)
1921
UpdateLoadDelay();
1922
1923
// We'd need to be careful here if we weren't overwriting it..
1924
ComputeLoadStoreAddressArg(cf, address, addr);
1925
1926
// Do PGXP first; it does its own load.
1927
if (g_settings.gpu_pgxp_enable && inst->r.rt != Reg::zero)
1928
{
1929
Flush(FLUSH_FOR_C_CALL);
1930
EmitMov(RARG1, inst->bits);
1931
armAsm->mov(RARG2, addr);
1932
MoveMIPSRegToReg(RARG3, inst->r.rt, true);
1933
EmitCall(reinterpret_cast<const void*>(&PGXP::CPU_LWx));
1934
}
1935
1936
armAsm->bic(RARG1, addr, 3);
1937
GenerateLoad(RARG1, MemoryAccessSize::Word, false, use_fastmem, []() { return RRET; });
1938
1939
if (inst->r.rt == Reg::zero)
1940
{
1941
FreeHostReg(addr.GetCode());
1942
return;
1943
}
1944
1945
// lwl/lwr from a load-delayed value takes the new value, but is itself load-delayed, so the original value is
1946
// never written back. NOTE: can't trust T in cf because of the flush
1947
const Reg rt = inst->r.rt;
1948
Register value;
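// If rt already has a load in flight, reuse its delay-value register; otherwise allocate a fresh target for
// the merged value.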
1949
if (m_load_delay_register == rt)
1950
{
1951
const u32 existing_ld_rt = (m_load_delay_value_register == NUM_HOST_REGS) ?
1952
AllocateHostReg(HR_MODE_READ, HR_TYPE_LOAD_DELAY_VALUE, rt) :
1953
m_load_delay_value_register;
1954
RenameHostReg(existing_ld_rt, HR_MODE_WRITE, HR_TYPE_NEXT_LOAD_DELAY_VALUE, rt);
1955
value = Register(existing_ld_rt);
1956
}
1957
else
1958
{
1959
if constexpr (EMULATE_LOAD_DELAYS)
1960
{
1961
value = Register(AllocateHostReg(HR_MODE_WRITE, HR_TYPE_NEXT_LOAD_DELAY_VALUE, rt));
1962
if (const std::optional<u32> rtreg = CheckHostReg(HR_MODE_READ, HR_TYPE_CPU_REG, rt); rtreg.has_value())
1963
armAsm->mov(value, Register(rtreg.value()));
1964
else if (HasConstantReg(rt))
1965
EmitMov(value, GetConstantRegU32(rt));
1966
else
1967
armAsm->ldr(value, MipsPtr(rt));
1968
}
1969
else
1970
{
1971
value = Register(AllocateHostReg(HR_MODE_READ | HR_MODE_WRITE, HR_TYPE_CPU_REG, rt));
1972
}
1973
}
1974
1975
DebugAssert(value.GetCode() != RARG2.GetCode() && value.GetCode() != RARG3.GetCode());
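// RARG2 = (addr & 3) * 8 (the shift amount) and RARG3 = 24 - shift; these feed the lwl/lwr masks below.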
1976
armAsm->and_(RARG2, addr, 3);
1977
armAsm->lsl(RARG2, RARG2, 3); // *8
1978
EmitMov(RARG3, 24);
1979
armAsm->sub(RARG3, RARG3, RARG2);
1980
1981
if (inst->op == InstructionOp::lwl)
1982
{
1983
// const u32 mask = UINT32_C(0x00FFFFFF) >> shift;
1984
// new_value = (value & mask) | (RWRET << (24 - shift));
1985
EmitMov(RSCRATCH, 0xFFFFFFu);
1986
armAsm->lsr(RSCRATCH, RSCRATCH, RARG2);
1987
armAsm->and_(value, value, RSCRATCH);
1988
armAsm->lsl(RRET, RRET, RARG3);
1989
armAsm->orr(value, value, RRET);
1990
}
1991
else
1992
{
1993
// const u32 mask = UINT32_C(0xFFFFFF00) << (24 - shift);
1994
// new_value = (value & mask) | (RWRET >> shift);
1995
armAsm->lsr(RRET, RRET, RARG2);
1996
EmitMov(RSCRATCH, 0xFFFFFF00u);
1997
armAsm->lsl(RSCRATCH, RSCRATCH, RARG3);
1998
armAsm->and_(value, value, RSCRATCH);
1999
armAsm->orr(value, value, RRET);
2000
}
2001
2002
FreeHostReg(addr.GetCode());
2003
}
2004
2005
void CPU::ARM32Recompiler::Compile_lwc2(CompileFlags cf, MemoryAccessSize size, bool sign, bool use_fastmem,
2006
const std::optional<VirtualMemoryAddress>& address)
2007
{
2008
const u32 index = static_cast<u32>(inst->r.rt.GetValue());
2009
const auto [ptr, action] = GetGTERegisterPointer(index, true);
2010
const std::optional<Register> addr_reg = g_settings.gpu_pgxp_enable ?
2011
std::optional<Register>(Register(AllocateTempHostReg(HR_CALLEE_SAVED))) :
2012
std::optional<Register>();
2013
FlushForLoadStore(address, false, use_fastmem);
2014
const Register addr = ComputeLoadStoreAddressArg(cf, address, addr_reg);
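// If the register needs the GTE write handler and PGXP is on, the loaded value must outlive further calls,
// so it goes in a callee-saved temp.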
2015
const Register value = GenerateLoad(addr, MemoryAccessSize::Word, false, use_fastmem, [this, action = action]() {
2016
return (action == GTERegisterAccessAction::CallHandler && g_settings.gpu_pgxp_enable) ?
2017
Register(AllocateTempHostReg(HR_CALLEE_SAVED)) :
2018
RRET;
2019
});
2020
2021
switch (action)
2022
{
2023
case GTERegisterAccessAction::Ignore:
2024
{
2025
break;
2026
}
2027
2028
case GTERegisterAccessAction::Direct:
2029
{
2030
armAsm->str(value, PTR(ptr));
2031
break;
2032
}
2033
2034
case GTERegisterAccessAction::SignExtend16:
2035
{
2036
armAsm->sxth(RARG3, value);
2037
armAsm->str(RARG3, PTR(ptr));
2038
break;
2039
}
2040
2041
case GTERegisterAccessAction::ZeroExtend16:
2042
{
2043
armAsm->uxth(RARG3, value);
2044
armAsm->str(RARG3, PTR(ptr));
2045
break;
2046
}
2047
2048
case GTERegisterAccessAction::CallHandler:
2049
{
2050
Flush(FLUSH_FOR_C_CALL);
2051
armAsm->mov(RARG2, value);
2052
EmitMov(RARG1, index);
2053
EmitCall(reinterpret_cast<const void*>(&GTE::WriteRegister));
2054
break;
2055
}
2056
2057
case GTERegisterAccessAction::PushFIFO:
2058
{
2059
// SXY0 <- SXY1
2060
// SXY1 <- SXY2
2061
// SXY2 <- SXYP
2062
DebugAssert(value.GetCode() != RARG2.GetCode() && value.GetCode() != RARG3.GetCode());
2063
armAsm->ldr(RARG2, PTR(&g_state.gte_regs.SXY1[0]));
2064
armAsm->ldr(RARG3, PTR(&g_state.gte_regs.SXY2[0]));
2065
armAsm->str(RARG2, PTR(&g_state.gte_regs.SXY0[0]));
2066
armAsm->str(RARG3, PTR(&g_state.gte_regs.SXY1[0]));
2067
armAsm->str(value, PTR(&g_state.gte_regs.SXY2[0]));
2068
break;
2069
}
2070
2071
default:
2072
{
2073
Panic("Unknown action");
2074
return;
2075
}
2076
}
2077
2078
if (g_settings.gpu_pgxp_enable)
2079
{
2080
Flush(FLUSH_FOR_C_CALL);
2081
armAsm->mov(RARG3, value);
2082
if (value.GetCode() != RRET.GetCode())
2083
FreeHostReg(value.GetCode());
2084
armAsm->mov(RARG2, addr);
2085
FreeHostReg(addr_reg.value().GetCode());
2086
EmitMov(RARG1, inst->bits);
2087
EmitCall(reinterpret_cast<const void*>(&PGXP::CPU_LWC2));
2088
}
2089
}
2090
2091
void CPU::ARM32Recompiler::Compile_sxx(CompileFlags cf, MemoryAccessSize size, bool sign, bool use_fastmem,
2092
const std::optional<VirtualMemoryAddress>& address)
2093
{
2094
AssertRegOrConstS(cf);
2095
AssertRegOrConstT(cf);
2096
2097
const std::optional<Register> addr_reg = g_settings.gpu_pgxp_enable ?
2098
std::optional<Register>(Register(AllocateTempHostReg(HR_CALLEE_SAVED))) :
2099
std::optional<Register>();
2100
FlushForLoadStore(address, true, use_fastmem);
2101
const Register addr = ComputeLoadStoreAddressArg(cf, address, addr_reg);
2102
const Register data = cf.valid_host_t ? CFGetRegT(cf) : RARG2;
2103
if (!cf.valid_host_t)
2104
MoveTToReg(RARG2, cf);
2105
2106
GenerateStore(addr, data, size, use_fastmem);
2107
2108
if (g_settings.gpu_pgxp_enable)
2109
{
2110
Flush(FLUSH_FOR_C_CALL);
2111
MoveMIPSRegToReg(RARG3, cf.MipsT());
2112
armAsm->mov(RARG2, addr);
2113
EmitMov(RARG1, inst->bits);
2114
EmitCall(s_pgxp_mem_store_functions[static_cast<u32>(size)]);
2115
FreeHostReg(addr_reg.value().GetCode());
2116
}
2117
}
2118
2119
void CPU::ARM32Recompiler::Compile_swx(CompileFlags cf, MemoryAccessSize size, bool sign, bool use_fastmem,
2120
const std::optional<VirtualMemoryAddress>& address)
2121
{
2122
DebugAssert(size == MemoryAccessSize::Word && !sign);
2123
2124
// TODO: this can take over rt's value if it's no longer needed
2125
// NOTE: can't trust T in cf because of the alloc
2126
const Register addr = Register(AllocateTempHostReg(HR_CALLEE_SAVED));
2127
2128
FlushForLoadStore(address, true, use_fastmem);
2129
2130
// TODO: if address is constant, this can be simplified..
2131
// We'd need to be careful here if we weren't overwriting it..
2132
ComputeLoadStoreAddressArg(cf, address, addr);
2133
2134
if (g_settings.gpu_pgxp_enable)
2135
{
2136
Flush(FLUSH_FOR_C_CALL);
2137
EmitMov(RARG1, inst->bits);
2138
armAsm->mov(RARG2, addr);
2139
MoveMIPSRegToReg(RARG3, inst->r.rt);
2140
EmitCall(reinterpret_cast<const void*>(&PGXP::CPU_SWx));
2141
}
2142
2143
armAsm->bic(RARG1, addr, 3);
2144
GenerateLoad(RARG1, MemoryAccessSize::Word, false, use_fastmem, []() { return RRET; });
2145
2146
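// RSCRATCH = (addr & 3) * 8 (the shift amount); the store address itself is aligned down to the containing word.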
armAsm->and_(RSCRATCH, addr, 3);
2147
armAsm->lsl(RSCRATCH, RSCRATCH, 3); // *8
2148
armAsm->bic(addr, addr, 3);
2149
2150
MoveMIPSRegToReg(RARG2, inst->r.rt);
2151
2152
if (inst->op == InstructionOp::swl)
2153
{
2154
// const u32 mem_mask = UINT32_C(0xFFFFFF00) << shift;
2155
// new_value = (RWRET & mem_mask) | (value >> (24 - shift));
2156
EmitMov(RARG3, 0xFFFFFF00u);
2157
armAsm->lsl(RARG3, RARG3, RSCRATCH);
2158
armAsm->and_(RRET, RRET, RARG3);
2159
2160
EmitMov(RARG3, 24);
2161
armAsm->sub(RARG3, RARG3, RSCRATCH);
2162
armAsm->lsr(RARG2, RARG2, RARG3);
2163
armAsm->orr(RARG2, RARG2, RRET);
2164
}
2165
else
2166
{
2167
// const u32 mem_mask = UINT32_C(0x00FFFFFF) >> (24 - shift);
2168
// new_value = (RWRET & mem_mask) | (value << shift);
2169
armAsm->lsl(RARG2, RARG2, RSCRATCH);
2170
2171
EmitMov(RARG3, 24);
2172
armAsm->sub(RARG3, RARG3, RSCRATCH);
2173
EmitMov(RSCRATCH, 0x00FFFFFFu);
2174
armAsm->lsr(RSCRATCH, RSCRATCH, RARG3);
2175
armAsm->and_(RRET, RRET, RSCRATCH);
2176
armAsm->orr(RARG2, RARG2, RRET);
2177
}
2178
2179
GenerateStore(addr, RARG2, MemoryAccessSize::Word, use_fastmem);
2180
FreeHostReg(addr.GetCode());
2181
}
2182
2183
void CPU::ARM32Recompiler::Compile_swc2(CompileFlags cf, MemoryAccessSize size, bool sign, bool use_fastmem,
2184
const std::optional<VirtualMemoryAddress>& address)
2185
{
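// The address (and, with PGXP, the data) must survive the GTE/PGXP calls, so they are held in callee-saved temps.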
2186
const u32 index = static_cast<u32>(inst->r.rt.GetValue());
2187
const auto [ptr, action] = GetGTERegisterPointer(index, false);
2188
const Register addr = (g_settings.gpu_pgxp_enable || action == GTERegisterAccessAction::CallHandler) ?
2189
Register(AllocateTempHostReg(HR_CALLEE_SAVED)) :
2190
RARG1;
2191
const Register data = g_settings.gpu_pgxp_enable ? Register(AllocateTempHostReg(HR_CALLEE_SAVED)) : RARG2;
2192
FlushForLoadStore(address, true, use_fastmem);
2193
ComputeLoadStoreAddressArg(cf, address, addr);
2194
2195
switch (action)
2196
{
2197
case GTERegisterAccessAction::Direct:
2198
{
2199
armAsm->ldr(data, PTR(ptr));
2200
}
2201
break;
2202
2203
case GTERegisterAccessAction::CallHandler:
2204
{
2205
// should already be flushed.. except in fastmem case
2206
Flush(FLUSH_FOR_C_CALL);
2207
EmitMov(RARG1, index);
2208
EmitCall(reinterpret_cast<const void*>(&GTE::ReadRegister));
2209
armAsm->mov(data, RRET);
2210
}
2211
break;
2212
2213
default:
2214
{
2215
Panic("Unknown action");
2216
}
2217
break;
2218
}
2219
2220
GenerateStore(addr, data, size, use_fastmem);
2221
if (!g_settings.gpu_pgxp_enable)
2222
{
2223
if (addr.GetCode() != RARG1.GetCode())
2224
FreeHostReg(addr.GetCode());
2225
}
2226
else
2227
{
2228
// TODO: This can be simplified because we don't need to validate in PGXP..
2229
Flush(FLUSH_FOR_C_CALL);
2230
armAsm->mov(RARG3, data);
2231
FreeHostReg(data.GetCode());
2232
armAsm->mov(RARG2, addr);
2233
FreeHostReg(addr.GetCode());
2234
EmitMov(RARG1, inst->bits);
2235
EmitCall(reinterpret_cast<const void*>(&PGXP::CPU_SWC2));
2236
}
2237
}
2238
2239
void CPU::ARM32Recompiler::Compile_mtc0(CompileFlags cf)
2240
{
2241
// TODO: we need better constant setting here.. which will need backprop
2242
AssertRegOrConstT(cf);
2243
2244
const Cop0Reg reg = static_cast<Cop0Reg>(MipsD());
2245
const u32* ptr = GetCop0RegPtr(reg);
2246
const u32 mask = GetCop0RegWriteMask(reg);
2247
if (!ptr)
2248
{
2249
Compile_Fallback();
2250
return;
2251
}
2252
2253
if (mask == 0)
2254
{
2255
// if it's a read-only register, ignore
2256
DEBUG_LOG("Ignoring write to read-only cop0 reg {}", static_cast<u32>(reg));
2257
return;
2258
}
2259
2260
// for some registers (currently just SR), we need to know which bits changed
2261
const bool needs_bit_test = (reg == Cop0Reg::SR);
2262
const Register new_value = RARG1;
2263
const Register old_value = RARG2;
2264
const Register changed_bits = RARG3;
2265
const Register mask_reg = RSCRATCH;
2266
2267
// Load old value
2268
armAsm->ldr(old_value, PTR(ptr));
2269
2270
// No way we fit this in an immediate..
2271
EmitMov(mask_reg, mask);
2272
2273
// update value
2274
if (cf.valid_host_t)
2275
armAsm->and_(new_value, CFGetRegT(cf), mask_reg);
2276
else
2277
EmitMov(new_value, GetConstantRegU32(cf.MipsT()) & mask);
2278
2279
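// changed_bits (old ^ new) is only needed for SR, where a change to the cache isolation bit (bit 16) forces
// the memory pointers to be refreshed.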
if (needs_bit_test)
2280
armAsm->eor(changed_bits, old_value, new_value);
2281
armAsm->bic(old_value, old_value, mask_reg);
2282
armAsm->orr(new_value, old_value, new_value);
2283
armAsm->str(new_value, PTR(ptr));
2284
2285
if (reg == Cop0Reg::SR)
2286
{
2287
// TODO: replace with register backup
2288
// We could just inline the whole thing..
2289
Flush(FLUSH_FOR_C_CALL);
2290
2291
Label caches_unchanged;
2292
armAsm->tst(changed_bits, 1u << 16);
2293
armAsm->b(eq, &caches_unchanged);
2294
EmitCall(reinterpret_cast<const void*>(&CPU::UpdateMemoryPointers));
2295
armAsm->ldr(RARG1, PTR(ptr)); // reload value for interrupt test below
2296
armAsm->bind(&caches_unchanged);
2297
2298
// might need to reload fastmem base too
2299
if (CodeCache::IsUsingFastmem() && m_block->HasFlag(CodeCache::BlockFlags::ContainsLoadStoreInstructions) &&
2300
IsHostRegAllocated(RMEMBASE.GetCode()))
2301
{
2302
FreeHostReg(RMEMBASE.GetCode());
2303
}
2304
2305
TestInterrupts(RARG1);
2306
}
2307
else if (reg == Cop0Reg::CAUSE)
2308
{
2309
armAsm->ldr(RARG1, PTR(&g_state.cop0_regs.sr.bits));
2310
TestInterrupts(RARG1);
2311
}
2312
else if (reg == Cop0Reg::DCIC || reg == Cop0Reg::BPCM)
2313
{
2314
// need to check whether we're switching to debug mode
2315
Flush(FLUSH_FOR_C_CALL);
2316
EmitCall(reinterpret_cast<const void*>(&CPU::UpdateDebugDispatcherFlag));
2317
SwitchToFarCodeIfRegZeroOrNonZero(RRET, true);
2318
BackupHostState();
2319
Flush(FLUSH_FOR_EARLY_BLOCK_EXIT);
2320
EmitCall(reinterpret_cast<const void*>(&CPU::ExitExecution)); // does not return
2321
RestoreHostState();
2322
SwitchToNearCode(false);
2323
}
2324
}
2325
2326
void CPU::ARM32Recompiler::Compile_rfe(CompileFlags cf)
2327
{
2328
// shift mode bits right two, preserving upper bits
2329
armAsm->ldr(RARG1, PTR(&g_state.cop0_regs.sr.bits));
2330
armAsm->bic(RARG2, RARG1, 15);
2331
armAsm->ubfx(RARG1, RARG1, 2, 4);
2332
armAsm->orr(RARG1, RARG1, RARG2);
2333
armAsm->str(RARG1, PTR(&g_state.cop0_regs.sr.bits));
2334
2335
TestInterrupts(RARG1);
2336
}
2337
2338
void CPU::ARM32Recompiler::TestInterrupts(const vixl::aarch32::Register& sr)
2339
{
2340
// if Iec == 0 then goto no_interrupt
2341
Label no_interrupt;
2342
armAsm->tst(sr, 1);
2343
armAsm->b(eq, &no_interrupt);
2344
2345
// sr & cause
2346
armAsm->ldr(RSCRATCH, PTR(&g_state.cop0_regs.cause.bits));
2347
armAsm->and_(sr, sr, RSCRATCH);
2348
2349
// if ((sr & cause) & 0xff00) == 0 then goto no_interrupt
2350
armAsm->tst(sr, 0xFF00);
2351
2352
SwitchToFarCode(true, ne);
2353
BackupHostState();
2354
2355
// Update load delay; this normally happens at the end of an instruction, but we're finishing it early.
2356
UpdateLoadDelay();
2357
2358
Flush(FLUSH_END_BLOCK | FLUSH_FOR_EXCEPTION | FLUSH_FOR_C_CALL);
2359
2360
// Can't use EndBlockWithException() here, because it'll use the wrong PC.
2361
// Can't use RaiseException() on the fast path if we're the last instruction, because the next PC is unknown.
2362
if (!iinfo->is_last_instruction)
2363
{
2364
EmitMov(RARG1, Cop0Registers::CAUSE::MakeValueForException(Exception::INT, iinfo->is_branch_instruction, false,
2365
(inst + 1)->cop.cop_n));
2366
EmitMov(RARG2, m_compiler_pc);
2367
EmitCall(reinterpret_cast<const void*>(static_cast<void (*)(u32, u32)>(&CPU::RaiseException)));
2368
m_dirty_pc = false;
2369
EndAndLinkBlock(std::nullopt, true, false);
2370
}
2371
else
2372
{
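// Last instruction in the block: the next PC is unknown, so zero the downcount and let the dispatcher
// deliver the pending interrupt.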
2373
EmitMov(RARG1, 0);
2374
if (m_dirty_pc)
2375
EmitMov(RARG2, m_compiler_pc);
2376
armAsm->str(RARG1, PTR(&g_state.downcount));
2377
if (m_dirty_pc)
2378
armAsm->str(RARG2, PTR(&g_state.pc));
2379
m_dirty_pc = false;
2380
EndAndLinkBlock(std::nullopt, false, true);
2381
}
2382
2383
RestoreHostState();
2384
SwitchToNearCode(false);
2385
2386
armAsm->bind(&no_interrupt);
2387
}
2388
2389
void CPU::ARM32Recompiler::Compile_mfc2(CompileFlags cf)
2390
{
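// mfc2 results go through the usual load-delay handling, the same as a memory load.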
2391
const u32 index = inst->cop.Cop2Index();
2392
const Reg rt = inst->r.rt;
2393
2394
const auto [ptr, action] = GetGTERegisterPointer(index, false);
2395
if (action == GTERegisterAccessAction::Ignore)
2396
return;
2397
2398
u32 hreg;
2399
if (action == GTERegisterAccessAction::Direct)
2400
{
2401
hreg = AllocateHostReg(GetFlagsForNewLoadDelayedReg(),
2402
EMULATE_LOAD_DELAYS ? HR_TYPE_NEXT_LOAD_DELAY_VALUE : HR_TYPE_CPU_REG, rt);
2403
armAsm->ldr(Register(hreg), PTR(ptr));
2404
}
2405
else if (action == GTERegisterAccessAction::CallHandler)
2406
{
2407
Flush(FLUSH_FOR_C_CALL);
2408
EmitMov(RARG1, index);
2409
EmitCall(reinterpret_cast<const void*>(&GTE::ReadRegister));
2410
2411
hreg = AllocateHostReg(GetFlagsForNewLoadDelayedReg(),
2412
EMULATE_LOAD_DELAYS ? HR_TYPE_NEXT_LOAD_DELAY_VALUE : HR_TYPE_CPU_REG, rt);
2413
armAsm->mov(Register(hreg), RRET);
2414
}
2415
else
2416
{
2417
Panic("Unknown action");
2418
return;
2419
}
2420
2421
if (g_settings.gpu_pgxp_enable)
2422
{
2423
Flush(FLUSH_FOR_C_CALL);
2424
EmitMov(RARG1, inst->bits);
2425
armAsm->mov(RARG2, Register(hreg));
2426
EmitCall(reinterpret_cast<const void*>(&PGXP::CPU_MFC2));
2427
}
2428
}
2429
2430
void CPU::ARM32Recompiler::Compile_mtc2(CompileFlags cf)
2431
{
2432
const u32 index = inst->cop.Cop2Index();
2433
const auto [ptr, action] = GetGTERegisterPointer(index, true);
2434
if (action == GTERegisterAccessAction::Ignore)
2435
return;
2436
2437
if (action == GTERegisterAccessAction::Direct)
2438
{
2439
if (cf.const_t)
2440
StoreConstantToCPUPointer(GetConstantRegU32(cf.MipsT()), ptr);
2441
else
2442
armAsm->str(CFGetRegT(cf), PTR(ptr));
2443
}
2444
else if (action == GTERegisterAccessAction::SignExtend16 || action == GTERegisterAccessAction::ZeroExtend16)
2445
{
2446
const bool sign = (action == GTERegisterAccessAction::SignExtend16);
2447
if (cf.valid_host_t)
2448
{
2449
sign ? armAsm->sxth(RARG1, CFGetRegT(cf)) : armAsm->uxth(RARG1, CFGetRegT(cf));
2450
armAsm->str(RARG1, PTR(ptr));
2451
}
2452
else if (cf.const_t)
2453
{
2454
const u16 cv = Truncate16(GetConstantRegU32(cf.MipsT()));
2455
StoreConstantToCPUPointer(sign ? ::SignExtend32(cv) : ::ZeroExtend32(cv), ptr);
2456
}
2457
else
2458
{
2459
Panic("Unsupported setup");
2460
}
2461
}
2462
else if (action == GTERegisterAccessAction::CallHandler)
2463
{
2464
Flush(FLUSH_FOR_C_CALL);
2465
EmitMov(RARG1, index);
2466
MoveTToReg(RARG2, cf);
2467
EmitCall(reinterpret_cast<const void*>(&GTE::WriteRegister));
2468
}
2469
else if (action == GTERegisterAccessAction::PushFIFO)
2470
{
2471
// SXY0 <- SXY1
2472
// SXY1 <- SXY2
2473
// SXY2 <- SXYP
2474
DebugAssert(RRET.GetCode() != RARG2.GetCode() && RRET.GetCode() != RARG3.GetCode());
2475
armAsm->ldr(RARG2, PTR(&g_state.gte_regs.SXY1[0]));
2476
armAsm->ldr(RARG3, PTR(&g_state.gte_regs.SXY2[0]));
2477
armAsm->str(RARG2, PTR(&g_state.gte_regs.SXY0[0]));
2478
armAsm->str(RARG3, PTR(&g_state.gte_regs.SXY1[0]));
2479
if (cf.valid_host_t)
2480
armAsm->str(CFGetRegT(cf), PTR(&g_state.gte_regs.SXY2[0]));
2481
else if (cf.const_t)
2482
StoreConstantToCPUPointer(GetConstantRegU32(cf.MipsT()), &g_state.gte_regs.SXY2[0]);
2483
else
2484
Panic("Unsupported setup");
2485
}
2486
else
2487
{
2488
Panic("Unknown action");
2489
}
2490
}
2491
2492
void CPU::ARM32Recompiler::Compile_cop2(CompileFlags cf)
2493
{
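// GTE operations are not recompiled; call the interpreter's handler for this instruction and charge its cycle cost.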
2494
TickCount func_ticks;
2495
GTE::InstructionImpl func = GTE::GetInstructionImpl(inst->bits, &func_ticks);
2496
2497
Flush(FLUSH_FOR_C_CALL);
2498
EmitMov(RARG1, inst->bits & GTE::Instruction::REQUIRED_BITS_MASK);
2499
EmitCall(reinterpret_cast<const void*>(func));
2500
2501
AddGTETicks(func_ticks);
2502
}
2503
2504
u32 CPU::Recompiler::CompileLoadStoreThunk(void* thunk_code, u32 thunk_space, void* code_address, u32 code_size,
2505
TickCount cycles_to_add, TickCount cycles_to_remove, u32 gpr_bitmask,
2506
u8 address_register, u8 data_register, MemoryAccessSize size, bool is_signed,
2507
bool is_load)
2508
{
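// Builds the slow-path thunk used when a fastmem access is backpatched: save live registers, adjust the
// tick count, call the memory handler, and jump back to the code following the patched access.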
2509
Assembler arm_asm(static_cast<u8*>(thunk_code), thunk_space);
2510
Assembler* armAsm = &arm_asm;
2511
2512
#ifdef VIXL_DEBUG
2513
vixl::CodeBufferCheckScope asm_check(armAsm, thunk_space, vixl::CodeBufferCheckScope::kDontReserveBufferSpace);
2514
#endif
2515
2516
// save regs
2517
RegisterList save_regs;
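// Save the caller-saved registers flagged in gpr_bitmask, skipping a load's destination since it is
// overwritten by the result anyway.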
2518
2519
for (u32 i = 0; i < NUM_HOST_REGS; i++)
2520
{
2521
if ((gpr_bitmask & (1u << i)) && armIsCallerSavedRegister(i) && (!is_load || data_register != i))
2522
save_regs.Combine(RegisterList(Register(i)));
2523
}
2524
2525
if (!save_regs.IsEmpty())
2526
armAsm->push(save_regs);
2527
2528
if (address_register != static_cast<u8>(RARG1.GetCode()))
2529
armAsm->mov(RARG1, Register(address_register));
2530
2531
if (!is_load)
2532
{
2533
if (data_register != static_cast<u8>(RARG2.GetCode()))
2534
armAsm->mov(RARG2, Register(data_register));
2535
}
2536
2537
if (cycles_to_add != 0)
2538
{
2539
// NOTE: we have to reload here, because memory writes can run DMA, which can screw with cycles
2540
armAsm->ldr(RARG3, PTR(&g_state.pending_ticks));
2541
if (!ImmediateA32::IsImmediateA32(cycles_to_add))
2542
{
2543
armEmitMov(armAsm, RSCRATCH, cycles_to_add);
2544
armAsm->add(RARG3, RARG3, RSCRATCH);
2545
}
2546
else
2547
{
2548
armAsm->add(RARG3, RARG3, cycles_to_add);
2549
}
2550
2551
armAsm->str(RARG3, PTR(&g_state.pending_ticks));
2552
}
2553
2554
switch (size)
2555
{
2556
case MemoryAccessSize::Byte:
2557
{
2558
armEmitCall(armAsm,
2559
is_load ? reinterpret_cast<const void*>(&RecompilerThunks::UncheckedReadMemoryByte) :
2560
reinterpret_cast<const void*>(&RecompilerThunks::UncheckedWriteMemoryByte),
2561
false);
2562
}
2563
break;
2564
case MemoryAccessSize::HalfWord:
2565
{
2566
armEmitCall(armAsm,
2567
is_load ? reinterpret_cast<const void*>(&RecompilerThunks::UncheckedReadMemoryHalfWord) :
2568
reinterpret_cast<const void*>(&RecompilerThunks::UncheckedWriteMemoryHalfWord),
2569
false);
2570
}
2571
break;
2572
case MemoryAccessSize::Word:
2573
{
2574
armEmitCall(armAsm,
2575
is_load ? reinterpret_cast<const void*>(&RecompilerThunks::UncheckedReadMemoryWord) :
2576
reinterpret_cast<const void*>(&RecompilerThunks::UncheckedWriteMemoryWord),
2577
false);
2578
}
2579
break;
2580
}
2581
2582
if (is_load)
2583
{
2584
const Register dst = Register(data_register);
2585
switch (size)
2586
{
2587
case MemoryAccessSize::Byte:
2588
{
2589
is_signed ? armAsm->sxtb(dst, RRET) : armAsm->uxtb(dst, RRET);
2590
}
2591
break;
2592
case MemoryAccessSize::HalfWord:
2593
{
2594
is_signed ? armAsm->sxth(dst, RRET) : armAsm->uxth(dst, RRET);
2595
}
2596
break;
2597
case MemoryAccessSize::Word:
2598
{
2599
if (dst.GetCode() != RRET.GetCode())
2600
armAsm->mov(dst, RRET);
2601
}
2602
break;
2603
}
2604
}
2605
2606
if (cycles_to_remove != 0)
2607
{
2608
armAsm->ldr(RARG3, PTR(&g_state.pending_ticks));
2609
if (!ImmediateA32::IsImmediateA32(cycles_to_remove))
2610
{
2611
armEmitMov(armAsm, RSCRATCH, cycles_to_remove);
2612
armAsm->sub(RARG3, RARG3, RSCRATCH);
2613
}
2614
else
2615
{
2616
armAsm->sub(RARG3, RARG3, cycles_to_remove);
2617
}
2618
armAsm->str(RARG3, PTR(&g_state.pending_ticks));
2619
}
2620
2621
// restore regs
2622
if (!save_regs.IsEmpty())
2623
armAsm->pop(save_regs);
2624
2625
armEmitJmp(armAsm, static_cast<const u8*>(code_address) + code_size, true);
2626
armAsm->FinalizeCode();
2627
2628
return static_cast<u32>(armAsm->GetCursorOffset());
2629
}
2630
2631
#endif // CPU_ARCH_ARM32
2632
2633