GitHub Repository: stenzek/duckstation
Path: blob/master/src/common/gsvector_neon.h
// SPDX-FileCopyrightText: 2019-2024 Connor McLaughlin <[email protected]>
// SPDX-License-Identifier: CC-BY-NC-ND-4.0

#include "common/intrin.h"
#include "common/types.h"

#include <algorithm>
#include <cmath>
#include <cstdint>

#define GSVECTOR_HAS_FAST_INT_SHUFFLE8 1
#define GSVECTOR_HAS_SRLV 1

#ifdef CPU_ARCH_ARM64
// tbl2 with 128-bit vectors is not in A32.
#define GSVECTOR_HAS_TBL2 1
#endif

class GSVector2;
class GSVector2i;
class GSVector4;
class GSVector4i;
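
// The GSVECTOR_HAS_* macros above advertise which fast paths this backend
// provides so shared GSVector code can select them at compile time, e.g.
// (illustrative sketch only, not a call site in this file):
//
//   #ifdef GSVECTOR_HAS_TBL2
//     // single two-table byte shuffle (vqtbl2q_u8) across 32 source bytes
//   #else
//     // two single-table shuffles plus a combine
//   #endif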
class alignas(16) GSVector2i
25
{
26
struct cxpr_init_tag
27
{
28
};
29
static constexpr cxpr_init_tag cxpr_init{};
30
31
constexpr GSVector2i(cxpr_init_tag, s32 x, s32 y) : S32{x, y} {}
32
33
constexpr GSVector2i(cxpr_init_tag, s16 s0, s16 s1, s16 s2, s16 s3) : S16{s0, s1, s2, s3} {}
34
35
constexpr GSVector2i(cxpr_init_tag, s8 b0, s8 b1, s8 b2, s8 b3, s8 b4, s8 b5, s8 b6, s8 b7)
36
: S8{b0, b1, b2, b3, b4, b5, b6, b7}
37
{
38
}
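
// Note: the cxpr_init tag above dispatches to constructors that fill the
// union through its plain arrays, so the cxpr()/cxpr16()/cxpr8() helpers
// below can build compile-time constants; the runtime constructors go
// through NEON intrinsics instead.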
39
40
public:
41
union
42
{
43
struct
44
{
45
s32 x, y;
46
};
47
struct
48
{
49
s32 r, g;
50
};
51
float F32[2];
52
s8 S8[8];
53
s16 S16[4];
54
s32 S32[2];
55
s64 S64[1];
56
u8 U8[8];
57
u16 U16[4];
58
u32 U32[2];
59
u64 U64[1];
60
int32x2_t v2s;
61
};
62
63
GSVector2i() = default;
64
65
ALWAYS_INLINE constexpr static GSVector2i cxpr(s32 x, s32 y) { return GSVector2i(cxpr_init, x, y); }
66
67
ALWAYS_INLINE constexpr static GSVector2i cxpr(s32 x) { return GSVector2i(cxpr_init, x, x); }
68
69
ALWAYS_INLINE constexpr static GSVector2i cxpr16(s16 x) { return GSVector2i(cxpr_init, x, x, x, x); }
70
71
ALWAYS_INLINE constexpr static GSVector2i cxpr16(s16 s0, s16 s1, s16 s2, s16 s3)
72
{
73
return GSVector2i(cxpr_init, s0, s1, s2, s3);
74
}
75
76
ALWAYS_INLINE constexpr static GSVector2i cxpr8(s8 b0, s8 b1, s8 b2, s8 b3, s8 b4, s8 b5, s8 b6, s8 b7)
77
{
78
return GSVector2i(cxpr_init, b0, b1, b2, b3, b4, b5, b6, b7);
79
}
80
81
ALWAYS_INLINE GSVector2i(s32 x, s32 y) { v2s = vset_lane_s32(y, vdup_n_s32(x), 1); }
82
83
ALWAYS_INLINE GSVector2i(s16 s0, s16 s1, s16 s2, s16 s3) : S16{s0, s1, s2, s3} {}
84
85
ALWAYS_INLINE constexpr GSVector2i(s8 b0, s8 b1, s8 b2, s8 b3, s8 b4, s8 b5, s8 b6, s8 b7)
86
: S8{b0, b1, b2, b3, b4, b5, b6, b7}
87
{
88
}
89
90
ALWAYS_INLINE explicit GSVector2i(int i) { *this = i; }
91
92
ALWAYS_INLINE constexpr explicit GSVector2i(int32x2_t m) : v2s(m) {}
93
94
ALWAYS_INLINE explicit GSVector2i(const GSVector2& v);
95
96
ALWAYS_INLINE static GSVector2i cast(const GSVector2& v);
97
98
ALWAYS_INLINE void operator=(int i) { v2s = vdup_n_s32(i); }
99
100
ALWAYS_INLINE operator int32x2_t() const { return v2s; }
101
102
ALWAYS_INLINE GSVector2i sat_s8(const GSVector2i& min, const GSVector2i& max) const
103
{
104
return max_s8(min).min_s8(max);
105
}
106
ALWAYS_INLINE GSVector2i sat_s16(const GSVector2i& min, const GSVector2i& max) const
107
{
108
return max_s16(min).min_s16(max);
109
}
110
ALWAYS_INLINE GSVector2i sat_s32(const GSVector2i& min, const GSVector2i& max) const
111
{
112
return max_s32(min).min_s32(max);
113
}
114
115
ALWAYS_INLINE GSVector2i sat_u8(const GSVector2i& min, const GSVector2i& max) const
116
{
117
return max_u8(min).min_u8(max);
118
}
119
ALWAYS_INLINE GSVector2i sat_u16(const GSVector2i& min, const GSVector2i& max) const
120
{
121
return max_u16(min).min_u16(max);
122
}
123
ALWAYS_INLINE GSVector2i sat_u32(const GSVector2i& min, const GSVector2i& max) const
124
{
125
return max_u32(min).min_u32(max);
126
}
127
128
ALWAYS_INLINE GSVector2i min_s8(const GSVector2i& v) const
129
{
130
return GSVector2i(vreinterpret_s32_s8(vmin_s8(vreinterpret_s8_s32(v2s), vreinterpret_s8_s32(v.v2s))));
131
}
132
133
ALWAYS_INLINE GSVector2i max_s8(const GSVector2i& v) const
134
{
135
return GSVector2i(vreinterpret_s32_s8(vmax_s8(vreinterpret_s8_s32(v2s), vreinterpret_s8_s32(v.v2s))));
136
}
137
138
ALWAYS_INLINE GSVector2i min_s16(const GSVector2i& v) const
139
{
140
return GSVector2i(vreinterpret_s32_s16(vmin_s16(vreinterpret_s16_s32(v2s), vreinterpret_s16_s32(v.v2s))));
141
}
142
143
ALWAYS_INLINE GSVector2i max_s16(const GSVector2i& v) const
144
{
145
return GSVector2i(vreinterpret_s32_s16(vmax_s16(vreinterpret_s16_s32(v2s), vreinterpret_s16_s32(v.v2s))));
146
}
147
148
ALWAYS_INLINE GSVector2i min_s32(const GSVector2i& v) const { return GSVector2i(vmin_s32(v2s, v.v2s)); }
149
150
ALWAYS_INLINE GSVector2i max_s32(const GSVector2i& v) const { return GSVector2i(vmax_s32(v2s, v.v2s)); }
151
152
ALWAYS_INLINE GSVector2i min_u8(const GSVector2i& v) const
153
{
154
return GSVector2i(vreinterpret_s32_u8(vmin_u8(vreinterpret_u8_s32(v2s), vreinterpret_u8_s32(v.v2s))));
155
}
156
157
ALWAYS_INLINE GSVector2i max_u8(const GSVector2i& v) const
158
{
159
return GSVector2i(vreinterpret_s32_u8(vmax_u8(vreinterpret_u8_s32(v2s), vreinterpret_u8_s32(v.v2s))));
160
}
161
162
ALWAYS_INLINE GSVector2i min_u16(const GSVector2i& v) const
163
{
164
return GSVector2i(vreinterpret_s32_u16(vmin_u16(vreinterpret_u16_s32(v2s), vreinterpret_u16_s32(v.v2s))));
165
}
166
167
ALWAYS_INLINE GSVector2i max_u16(const GSVector2i& v) const
168
{
169
return GSVector2i(vreinterpret_s32_u16(vmax_u16(vreinterpret_u16_s32(v2s), vreinterpret_u16_s32(v.v2s))));
170
}
171
172
ALWAYS_INLINE GSVector2i min_u32(const GSVector2i& v) const
173
{
174
return GSVector2i(vreinterpret_s32_u32(vmin_u32(vreinterpret_u32_s32(v2s), vreinterpret_u32_s32(v.v2s))));
175
}
176
177
ALWAYS_INLINE GSVector2i max_u32(const GSVector2i& v) const
178
{
179
return GSVector2i(vreinterpret_s32_u32(vmax_u32(vreinterpret_u32_s32(v2s), vreinterpret_u32_s32(v.v2s))));
180
}
181
182
ALWAYS_INLINE s32 addv_s32() const
183
{
184
#ifdef CPU_ARCH_ARM64
185
return vaddv_s32(v2s);
186
#else
187
return vget_lane_s32(v2s, 0) + vget_lane_s32(v2s, 1);
188
#endif
189
}
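
// Usage note: addv_s32() is a horizontal add across both lanes, so e.g.
// GSVector2i(3, 4).addv_s32() == 7 on either path above.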
190
191
#ifdef CPU_ARCH_ARM64
192
193
ALWAYS_INLINE u8 minv_u8() const { return vminv_u8(vreinterpret_u8_s32(v2s)); }
194
195
ALWAYS_INLINE u16 maxv_u8() const { return vmaxv_u8(vreinterpret_u8_s32(v2s)); }
196
197
ALWAYS_INLINE u16 minv_u16() const { return vminv_u16(vreinterpret_u16_s32(v2s)); }
198
199
ALWAYS_INLINE u16 maxv_u16() const { return vmaxv_u16(vreinterpret_u16_s32(v2s)); }
200
201
ALWAYS_INLINE s32 minv_s32() const { return vminv_s32(v2s); }
202
203
ALWAYS_INLINE u32 minv_u32() const { return vminv_u32(vreinterpret_u32_s32(v2s)); }
204
205
ALWAYS_INLINE s32 maxv_s32() const { return vmaxv_s32(v2s); }
206
207
ALWAYS_INLINE u32 maxv_u32() const { return vmaxv_u32(vreinterpret_u32_s32(v2s)); }
208
209
#else
210
211
ALWAYS_INLINE u8 minv_u8() const
212
{
213
uint8x8_t vmin = vmin_u8(vreinterpret_u8_s32(v2s), vreinterpret_u8_s32(vdup_lane_s32(v2s, 1)));
214
return static_cast<u8>(
215
std::min(static_cast<u32>(vget_lane_u8(vmin, 0)),
216
std::min(static_cast<u32>(vget_lane_u8(vmin, 1)),
217
std::min(static_cast<u32>(vget_lane_u8(vmin, 2)), static_cast<u32>(vget_lane_u8(vmin, 3))))));
218
}
219
220
ALWAYS_INLINE u16 maxv_u8() const
221
{
222
uint8x8_t vmax = vmax_u8(vreinterpret_u8_s32(v2s), vreinterpret_u8_s32(vdup_lane_s32(v2s, 1)));
223
return static_cast<u8>(
224
std::max(static_cast<u32>(vget_lane_u8(vmax, 0)),
225
std::max(static_cast<u32>(vget_lane_u8(vmax, 1)),
226
std::max(static_cast<u32>(vget_lane_u8(vmax, 2)), static_cast<u32>(vget_lane_u8(vmax, 3))))));
227
}
228
229
ALWAYS_INLINE u16 minv_u16() const
230
{
231
uint16x4_t vmin = vmin_u16(vreinterpret_u16_s32(v2s), vreinterpret_u16_s32(vdup_lane_s32(v2s, 1)));
232
return static_cast<u16>(
233
std::min(static_cast<u32>(vget_lane_u16(vmin, 0)), static_cast<u32>(vget_lane_u16(vmin, 1))));
234
}
235
236
ALWAYS_INLINE u16 maxv_u16() const
237
{
238
uint16x4_t vmax = vmax_u16(vreinterpret_u16_s32(v2s), vreinterpret_u16_s32(vdup_lane_s32(v2s, 1)));
239
return static_cast<u16>(
240
std::max<u32>(static_cast<u32>(vget_lane_u16(vmax, 0)), static_cast<u32>(vget_lane_u16(vmax, 1))));
241
}
242
243
ALWAYS_INLINE s32 minv_s32() const { return std::min<s32>(vget_lane_s32(v2s, 0), vget_lane_s32(v2s, 1)); }
244
245
ALWAYS_INLINE u32 minv_u32() const
246
{
247
return std::min<u32>(vget_lane_u32(vreinterpret_u32_s32(v2s), 0), vget_lane_u32(vreinterpret_u32_s32(v2s), 1));
248
}
249
250
ALWAYS_INLINE s32 maxv_s32() const { return std::max<s32>(vget_lane_s32(v2s, 0), vget_lane_s32(v2s, 1)); }
251
252
ALWAYS_INLINE u32 maxv_u32() const
253
{
254
return std::max<u32>(vget_lane_u32(vreinterpret_u32_s32(v2s), 0), vget_lane_u32(vreinterpret_u32_s32(v2s), 1));
255
}
256
257
#endif
258
259
ALWAYS_INLINE GSVector2i clamp8() const { return pu16().upl8(); }
260
261
ALWAYS_INLINE GSVector2i blend8(const GSVector2i& a, const GSVector2i& mask) const
262
{
263
uint8x8_t mask2 = vreinterpret_u8_s8(vshr_n_s8(vreinterpret_s8_s32(mask.v2s), 7));
264
return GSVector2i(vreinterpret_s32_u8(vbsl_u8(mask2, vreinterpret_u8_s32(a.v2s), vreinterpret_u8_s32(v2s))));
265
}
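
// blend8() selects per byte on the sign bit of 'mask': vshr_n_s8(mask, 7)
// smears each byte's MSB across the byte, and vbsl_u8 then takes bytes from
// 'a' where that mask byte was negative and from *this otherwise. E.g. a
// mask byte of 0x80 picks 'a', a mask byte of 0x00 keeps *this.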
266
267
template<int mask>
268
ALWAYS_INLINE GSVector2i blend16(const GSVector2i& a) const
269
{
270
static constexpr const uint16_t _mask[4] = {
271
((mask) & (1 << 0)) ? (uint16_t)-1 : 0x0, ((mask) & (1 << 1)) ? (uint16_t)-1 : 0x0,
272
((mask) & (1 << 2)) ? (uint16_t)-1 : 0x0, ((mask) & (1 << 3)) ? (uint16_t)-1 : 0x0};
273
return GSVector2i(
274
vreinterpret_s32_u16(vbsl_u16(vld1_u16(_mask), vreinterpret_u16_s32(a.v2s), vreinterpret_u16_s32(v2s))));
275
}
276
277
template<int mask>
278
ALWAYS_INLINE GSVector2i blend32(const GSVector2i& v) const
279
{
280
constexpr int bit1 = ((mask & 2) * 3) << 1;
281
constexpr int bit0 = (mask & 1) * 3;
282
return blend16<bit1 | bit0>(v);
283
}
284
285
ALWAYS_INLINE GSVector2i blend(const GSVector2i& v, const GSVector2i& mask) const
286
{
287
return GSVector2i(vreinterpret_s32_s8(vorr_s8(vbic_s8(vreinterpret_s8_s32(v2s), vreinterpret_s8_s32(mask.v2s)),
288
vand_s8(vreinterpret_s8_s32(mask.v2s), vreinterpret_s8_s32(v.v2s)))));
289
}
290
291
ALWAYS_INLINE GSVector2i shuffle8(const GSVector2i& mask) const
292
{
293
return GSVector2i(vreinterpret_s32_s8(vtbl1_s8(vreinterpret_s8_s32(v2s), vreinterpret_u8_s32(mask.v2s))));
294
}
295
296
ALWAYS_INLINE GSVector2i ps16() const
297
{
298
return GSVector2i(vreinterpret_s32_s8(vqmovn_s16(vcombine_s16(vreinterpret_s16_s32(v2s), vcreate_s16(0)))));
299
}
300
301
ALWAYS_INLINE GSVector2i pu16() const
302
{
303
return GSVector2i(vreinterpret_s32_u8(vqmovn_u16(vcombine_u16(vreinterpret_u16_s32(v2s), vcreate_u16(0)))));
304
}
305
306
ALWAYS_INLINE GSVector2i ps32() const
307
{
308
return GSVector2i(vreinterpret_s32_s16(vqmovn_s32(vcombine_s32(v2s, vcreate_s32(0)))));
309
}
310
311
ALWAYS_INLINE GSVector2i pu32() const
312
{
313
return GSVector2i(vreinterpret_s32_u16(vqmovn_u32(vcombine_u32(vreinterpret_u32_s32(v2s), vcreate_u32(0)))));
314
}
315
316
#ifdef CPU_ARCH_ARM64
317
318
ALWAYS_INLINE GSVector2i upl8(const GSVector2i& v) const
319
{
320
return GSVector2i(vreinterpret_s32_s8(vzip1_s8(vreinterpret_s8_s32(v2s), vreinterpret_s8_s32(v.v2s))));
321
}
322
323
ALWAYS_INLINE GSVector2i upl16(const GSVector2i& v) const
324
{
325
return GSVector2i(vreinterpret_s32_s16(vzip1_s16(vreinterpret_s16_s32(v2s), vreinterpret_s16_s32(v.v2s))));
326
}
327
ALWAYS_INLINE GSVector2i upl32(const GSVector2i& v) const { return GSVector2i(vzip1_s32(v2s, v.v2s)); }
328
329
ALWAYS_INLINE GSVector2i upl8() const
330
{
331
return GSVector2i(vreinterpret_s32_s8(vzip1_s8(vreinterpret_s8_s32(v2s), vdup_n_s8(0))));
332
}
333
334
ALWAYS_INLINE GSVector2i upl16() const
335
{
336
return GSVector2i(vreinterpret_s32_s16(vzip1_s16(vreinterpret_s16_s32(v2s), vdup_n_s16(0))));
337
}
338
339
ALWAYS_INLINE GSVector2i upl32() const { return GSVector2i(vzip1_s32(v2s, vdup_n_s32(0))); }
340
341
#else
342
343
ALWAYS_INLINE GSVector2i upl8(const GSVector2i& v) const
344
{
345
return GSVector2i(vreinterpret_s32_s8(vzip_s8(vreinterpret_s8_s32(v2s), vreinterpret_s8_s32(v.v2s)).val[0]));
346
}
347
348
ALWAYS_INLINE GSVector2i upl16(const GSVector2i& v) const
349
{
350
return GSVector2i(vreinterpret_s32_s16(vzip_s16(vreinterpret_s16_s32(v2s), vreinterpret_s16_s32(v.v2s)).val[0]));
351
}
352
ALWAYS_INLINE GSVector2i upl32(const GSVector2i& v) const { return GSVector2i(vzip_s32(v2s, v.v2s).val[0]); }
353
354
ALWAYS_INLINE GSVector2i upl8() const
355
{
356
return GSVector2i(vreinterpret_s32_s8(vzip_s8(vreinterpret_s8_s32(v2s), vdup_n_s8(0)).val[0]));
357
}
358
359
ALWAYS_INLINE GSVector2i upl16() const
360
{
361
return GSVector2i(vreinterpret_s32_s16(vzip_s16(vreinterpret_s16_s32(v2s), vdup_n_s16(0)).val[0]));
362
}
363
364
ALWAYS_INLINE GSVector2i upl32() const { return GSVector2i(vzip_s32(v2s, vdup_n_s32(0)).val[0]); }
365
366
#endif
367
368
ALWAYS_INLINE GSVector2i s8to16() const
369
{
370
return GSVector2i(vreinterpret_s32_s16(vget_low_s16(vmovl_s8(vreinterpret_s8_s32(v2s)))));
371
}
372
373
ALWAYS_INLINE GSVector2i u8to16() const
374
{
375
return GSVector2i(vreinterpret_s32_u16(vget_low_u16(vmovl_u8(vreinterpret_u8_s32(v2s)))));
376
}
377
378
ALWAYS_INLINE GSVector2i s8to32() const
379
{
380
return GSVector2i(vget_low_s32(vmovl_s16(vget_low_s16(vmovl_s8(vreinterpret_s8_s32(v2s))))));
381
}
382
383
ALWAYS_INLINE GSVector2i u8to32() const
384
{
385
return GSVector2i(vreinterpret_s32_u32(vget_low_u32(vmovl_u16(vget_low_u16(vmovl_u8(vreinterpret_u8_s32(v2s)))))));
386
}
387
388
ALWAYS_INLINE GSVector2i s16to32() const { return GSVector2i(vget_low_s32(vmovl_s16(vreinterpret_s16_s32(v2s)))); }
389
390
ALWAYS_INLINE GSVector2i u16to32() const
391
{
392
return GSVector2i(vreinterpret_s32_u32(vget_low_u32(vmovl_u16(vreinterpret_u16_s32(v2s)))));
393
}
394
395
template<int i>
396
ALWAYS_INLINE GSVector2i srl() const
397
{
398
return GSVector2i(vreinterpret_s32_s8(vext_s8(vreinterpret_s8_s32(v2s), vdup_n_s8(0), i)));
399
}
400
401
template<int i>
402
ALWAYS_INLINE GSVector2i sll() const
403
{
404
return GSVector2i(vreinterpret_s32_s8(vext_s8(vdup_n_s8(0), vreinterpret_s8_s32(v2s), 8 - i)));
405
}
406
407
template<int i>
408
ALWAYS_INLINE GSVector2i sll16() const
409
{
410
return GSVector2i(vreinterpret_s32_s16(vshl_n_s16(vreinterpret_s16_s32(v2s), i)));
411
}
412
413
ALWAYS_INLINE GSVector2i sll16(s32 i) const
414
{
415
return GSVector2i(vreinterpret_s32_s16(vshl_s16(vreinterpret_s16_s32(v2s), vdup_n_s16(i))));
416
}
417
418
ALWAYS_INLINE GSVector2i sllv16(const GSVector2i& v) const
419
{
420
return GSVector2i(vreinterpret_s32_s16(vshl_s16(vreinterpret_s16_s32(v2s), vreinterpret_s16_s32(v.v2s))));
421
}
422
423
template<int i>
424
ALWAYS_INLINE GSVector2i srl16() const
425
{
426
return GSVector2i(vreinterpret_s32_u16(vshr_n_u16(vreinterpret_u16_s32(v2s), i)));
427
}
428
429
ALWAYS_INLINE GSVector2i srl16(s32 i) const
430
{
431
return GSVector2i(vreinterpret_s32_u16(vshl_u16(vreinterpret_u16_s32(v2s), vdup_n_s16(-i))));
432
}
433
434
ALWAYS_INLINE GSVector2i srlv16(const GSVector2i& v) const
435
{
436
return GSVector2i(vreinterpret_s32_u16(vshl_u16(vreinterpret_u16_s32(v2s), vneg_s16(vreinterpret_s16_s32(v.v2s)))));
437
}
438
439
template<int i>
440
ALWAYS_INLINE GSVector2i sra16() const
441
{
442
constexpr int count = (i & ~15) ? 15 : i;
443
return GSVector2i(vreinterpret_s32_s16(vshr_n_s16(vreinterpret_s16_s32(v2s), count)));
444
}
445
446
ALWAYS_INLINE GSVector2i sra16(s32 i) const
447
{
448
return GSVector2i(vreinterpret_s32_s16(vshl_s16(vreinterpret_s16_s32(v2s), vdup_n_s16(-i))));
449
}
450
451
ALWAYS_INLINE GSVector2i srav16(const GSVector2i& v) const
452
{
453
return GSVector2i(vreinterpret_s32_s16(vshl_s16(vreinterpret_s16_s32(v2s), vneg_s16(vreinterpret_s16_s32(v.v2s)))));
454
}
455
456
template<int i>
457
ALWAYS_INLINE GSVector2i sll32() const
458
{
459
return GSVector2i(vshl_n_s32(v2s, i));
460
}
461
462
ALWAYS_INLINE GSVector2i sll32(s32 i) const { return GSVector2i(vshl_s32(v2s, vdup_n_s32(i))); }
463
464
ALWAYS_INLINE GSVector2i sllv32(const GSVector2i& v) const { return GSVector2i(vshl_s32(v2s, v.v2s)); }
465
466
template<int i>
467
ALWAYS_INLINE GSVector2i srl32() const
468
{
469
return GSVector2i(vreinterpret_s32_u32(vshr_n_u32(vreinterpret_u32_s32(v2s), i)));
470
}
471
472
ALWAYS_INLINE GSVector2i srl32(s32 i) const
473
{
474
return GSVector2i(vreinterpret_s32_u32(vshl_u32(vreinterpret_u32_s32(v2s), vdup_n_s32(-i))));
475
}
476
477
ALWAYS_INLINE GSVector2i srlv32(const GSVector2i& v) const
478
{
479
return GSVector2i(vreinterpret_s32_u32(vshl_u32(vreinterpret_u32_s32(v2s), vneg_s32(v.v2s))));
480
}
481
482
template<int i>
483
ALWAYS_INLINE GSVector2i sra32() const
484
{
485
return GSVector2i(vshr_n_s32(v2s, i));
486
}
487
488
ALWAYS_INLINE GSVector2i sra32(s32 i) const { return GSVector2i(vshl_s32(v2s, vdup_n_s32(-i))); }
489
490
ALWAYS_INLINE GSVector2i srav32(const GSVector2i& v) const
491
{
492
return GSVector2i(vshl_s32(v2s, vneg_s32(v.v2s)));
493
}
494
495
ALWAYS_INLINE GSVector2i add8(const GSVector2i& v) const
496
{
497
return GSVector2i(vreinterpret_s32_s8(vadd_s8(vreinterpret_s8_s32(v2s), vreinterpret_s8_s32(v.v2s))));
498
}
499
500
ALWAYS_INLINE GSVector2i add16(const GSVector2i& v) const
501
{
502
return GSVector2i(vreinterpret_s32_s16(vadd_s16(vreinterpret_s16_s32(v2s), vreinterpret_s16_s32(v.v2s))));
503
}
504
505
ALWAYS_INLINE GSVector2i add32(const GSVector2i& v) const { return GSVector2i(vadd_s32(v2s, v.v2s)); }
506
507
ALWAYS_INLINE GSVector2i adds8(const GSVector2i& v) const
508
{
509
return GSVector2i(vreinterpret_s32_s8(vqadd_s8(vreinterpret_s8_s32(v2s), vreinterpret_s8_s32(v.v2s))));
510
}
511
512
ALWAYS_INLINE GSVector2i adds16(const GSVector2i& v) const
513
{
514
return GSVector2i(vreinterpret_s32_s16(vqadd_s16(vreinterpret_s16_s32(v2s), vreinterpret_s16_s32(v.v2s))));
515
}
516
517
ALWAYS_INLINE GSVector2i addus8(const GSVector2i& v) const
518
{
519
return GSVector2i(vreinterpret_s32_u8(vqadd_u8(vreinterpret_u8_s32(v2s), vreinterpret_u8_s32(v.v2s))));
520
}
521
522
ALWAYS_INLINE GSVector2i addus16(const GSVector2i& v) const
523
{
524
return GSVector2i(vreinterpret_s32_u16(vqadd_u16(vreinterpret_u16_s32(v2s), vreinterpret_u16_s32(v.v2s))));
525
}
526
527
ALWAYS_INLINE GSVector2i sub8(const GSVector2i& v) const
528
{
529
return GSVector2i(vreinterpret_s32_s8(vsub_s8(vreinterpret_s8_s32(v2s), vreinterpret_s8_s32(v.v2s))));
530
}
531
532
ALWAYS_INLINE GSVector2i sub16(const GSVector2i& v) const
533
{
534
return GSVector2i(vreinterpret_s32_s16(vsub_s16(vreinterpret_s16_s32(v2s), vreinterpret_s16_s32(v.v2s))));
535
}
536
537
ALWAYS_INLINE GSVector2i sub32(const GSVector2i& v) const { return GSVector2i(vsub_s32(v2s, v.v2s)); }
538
539
ALWAYS_INLINE GSVector2i subs8(const GSVector2i& v) const
540
{
541
return GSVector2i(vreinterpret_s32_s8(vqsub_s8(vreinterpret_s8_s32(v2s), vreinterpret_s8_s32(v.v2s))));
542
}
543
544
ALWAYS_INLINE GSVector2i subs16(const GSVector2i& v) const
545
{
546
return GSVector2i(vreinterpret_s32_s16(vqsub_s16(vreinterpret_s16_s32(v2s), vreinterpret_s16_s32(v.v2s))));
547
}
548
549
ALWAYS_INLINE GSVector2i subus8(const GSVector2i& v) const
550
{
551
return GSVector2i(vreinterpret_s32_u8(vqsub_u8(vreinterpret_u8_s32(v2s), vreinterpret_u8_s32(v.v2s))));
552
}
553
554
ALWAYS_INLINE GSVector2i subus16(const GSVector2i& v) const
555
{
556
return GSVector2i(vreinterpret_s32_u16(vqsub_u16(vreinterpret_u16_s32(v2s), vreinterpret_u16_s32(v.v2s))));
557
}
558
559
ALWAYS_INLINE GSVector2i mul16l(const GSVector2i& v) const
560
{
561
return GSVector2i(vreinterpret_s32_s16(vmul_s16(vreinterpret_s16_s32(v2s), vreinterpret_s16_s32(v.v2s))));
562
}
563
564
ALWAYS_INLINE GSVector2i mul32l(const GSVector2i& v) const { return GSVector2i(vmul_s32(v2s, v.v2s)); }
565
566
ALWAYS_INLINE bool eq(const GSVector2i& v) const
567
{
568
return (vget_lane_u64(vreinterpret_u64_s32(veor_s32(v2s, v.v2s)), 0) == 0);
569
}
570
571
ALWAYS_INLINE GSVector2i eq8(const GSVector2i& v) const
572
{
573
return GSVector2i(vreinterpret_s32_u8(vceq_s8(vreinterpret_s8_s32(v2s), vreinterpret_s8_s32(v.v2s))));
574
}
575
576
ALWAYS_INLINE GSVector2i eq16(const GSVector2i& v) const
577
{
578
return GSVector2i(vreinterpret_s32_u16(vceq_s16(vreinterpret_s16_s32(v2s), vreinterpret_s16_s32(v.v2s))));
579
}
580
581
ALWAYS_INLINE GSVector2i eq32(const GSVector2i& v) const
582
{
583
return GSVector2i(vreinterpret_s32_u32(vceq_s32(v2s, v.v2s)));
584
}
585
586
ALWAYS_INLINE GSVector2i neq8(const GSVector2i& v) const { return ~eq8(v); }
587
588
ALWAYS_INLINE GSVector2i neq16(const GSVector2i& v) const { return ~eq16(v); }
589
590
ALWAYS_INLINE GSVector2i neq32(const GSVector2i& v) const { return ~eq32(v); }
591
592
ALWAYS_INLINE GSVector2i gt8(const GSVector2i& v) const
593
{
594
return GSVector2i(vreinterpret_s32_u8(vcgt_s8(vreinterpret_s8_s32(v2s), vreinterpret_s8_s32(v.v2s))));
595
}
596
597
ALWAYS_INLINE GSVector2i gt16(const GSVector2i& v) const
598
{
599
return GSVector2i(vreinterpret_s32_u16(vcgt_s16(vreinterpret_s16_s32(v2s), vreinterpret_s16_s32(v.v2s))));
600
}
601
602
ALWAYS_INLINE GSVector2i gt32(const GSVector2i& v) const { return GSVector2i(vreinterpret_s32_u32(vcgt_s32(v2s, v.v2s))); }
603
604
ALWAYS_INLINE GSVector2i ge8(const GSVector2i& v) const
605
{
606
return GSVector2i(vreinterpret_s32_u8(vcge_s8(vreinterpret_s8_s32(v2s), vreinterpret_s8_s32(v.v2s))));
607
}
608
ALWAYS_INLINE GSVector2i ge16(const GSVector2i& v) const
609
{
610
return GSVector2i(vreinterpret_s32_u16(vcge_s16(vreinterpret_s16_s32(v2s), vreinterpret_s16_s32(v.v2s))));
611
}
612
ALWAYS_INLINE GSVector2i ge32(const GSVector2i& v) const { return GSVector2i(vreinterpret_s32_u32(vcge_s32(v2s, v.v2s))); }
613
614
ALWAYS_INLINE GSVector2i lt8(const GSVector2i& v) const
615
{
616
return GSVector2i(vreinterpret_s32_u8(vclt_s8(vreinterpret_s8_s32(v2s), vreinterpret_s8_s32(v.v2s))));
617
}
618
619
ALWAYS_INLINE GSVector2i lt16(const GSVector2i& v) const
620
{
621
return GSVector2i(vreinterpret_s32_u16(vclt_s16(vreinterpret_s16_s32(v2s), vreinterpret_s16_s32(v.v2s))));
622
}
623
624
ALWAYS_INLINE GSVector2i lt32(const GSVector2i& v) const { return GSVector2i(vreinterpret_s32_u32(vclt_s32(v2s, v.v2s))); }
625
626
ALWAYS_INLINE GSVector2i le8(const GSVector2i& v) const
627
{
628
return GSVector2i(vreinterpret_s32_u8(vcle_s8(vreinterpret_s8_s32(v2s), vreinterpret_s8_s32(v.v2s))));
629
}
630
ALWAYS_INLINE GSVector2i le16(const GSVector2i& v) const
631
{
632
return GSVector2i(vreinterpret_s32_u16(vcle_s16(vreinterpret_s16_s32(v2s), vreinterpret_s16_s32(v.v2s))));
633
}
634
ALWAYS_INLINE GSVector2i le32(const GSVector2i& v) const { return GSVector2i(vreinterpret_s32_u32(vcle_s32(v2s, v.v2s))); }
635
636
ALWAYS_INLINE GSVector2i andnot(const GSVector2i& v) const { return GSVector2i(vbic_s32(v2s, v.v2s)); }
637
638
ALWAYS_INLINE int mask() const
639
{
640
// borrowed from sse2neon
641
const uint16x4_t high_bits = vreinterpret_u16_u8(vshr_n_u8(vreinterpret_u8_s32(v2s), 7));
642
const uint32x2_t paired16 = vreinterpret_u32_u16(vsra_n_u16(high_bits, high_bits, 7));
643
const uint64x1_t paired32 = vreinterpret_u64_u32(vsra_n_u32(paired16, paired16, 14));
644
const uint8x8_t paired64 = vreinterpret_u8_u64(vsra_n_u64(paired32, paired32, 28));
645
return static_cast<int>(vget_lane_u8(paired64, 0));
646
}
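
// The shift/accumulate chain above packs the per-byte sign bits into the low
// byte of the result (the same trick sse2neon uses for _mm_movemask_epi8,
// here on a 64-bit vector). For example, if only bytes 0 and 7 have their
// sign bit set, mask() returns 0b10000001 == 0x81.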
647
648
ALWAYS_INLINE bool alltrue() const
649
{
650
return (vget_lane_u64(vreinterpret_u64_s32(v2s), 0) == UINT64_C(0xFFFFFFFFFFFFFFFF));
651
}
652
653
ALWAYS_INLINE bool allfalse() const { return (vget_lane_u64(vreinterpret_u64_s32(v2s), 0) == UINT64_C(0)); }
654
655
template<int i>
656
ALWAYS_INLINE GSVector2i insert8(int a) const
657
{
658
return GSVector2i(vreinterpret_s32_u8(vset_lane_u8(a, vreinterpret_u8_s32(v2s), static_cast<uint8_t>(i))));
659
}
660
661
template<int i>
662
ALWAYS_INLINE int extract8() const
663
{
664
return vget_lane_u8(vreinterpret_u8_s32(v2s), i);
665
}
666
667
template<int i>
668
ALWAYS_INLINE GSVector2i insert16(int a) const
669
{
670
return GSVector2i(vreinterpret_s32_u16(vset_lane_u16(a, vreinterpret_u16_s32(v2s), static_cast<uint16_t>(i))));
671
}
672
673
template<int i>
674
ALWAYS_INLINE int extract16() const
675
{
676
return vget_lane_u16(vreinterpret_u16_s32(v2s), i);
677
}
678
679
template<int i>
680
ALWAYS_INLINE GSVector2i insert32(int a) const
681
{
682
return GSVector2i(vset_lane_s32(a, v2s, i));
683
}
684
685
template<int i>
686
ALWAYS_INLINE int extract32() const
687
{
688
return vget_lane_s32(v2s, i);
689
}
690
691
ALWAYS_INLINE static GSVector2i load32(const void* p)
692
{
693
// should be ldr s0, [x0]
694
u32 val;
695
std::memcpy(&val, p, sizeof(u32));
696
return GSVector2i(vreinterpret_s32_u32(vset_lane_u32(val, vdup_n_u32(0), 0)));
697
}
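
// Usage note: the memcpy above is the usual way to express a possibly
// unaligned 32-bit load without undefined behaviour; optimising compilers
// typically fold it into the single "ldr s0" load mentioned in the comment.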
698
699
ALWAYS_INLINE static GSVector2i zext32(s32 v) { return GSVector2i(vset_lane_s32(v, vdup_n_s32(0), 0)); }
700
701
template<bool aligned>
702
ALWAYS_INLINE static GSVector2i load(const void* p)
703
{
704
#ifdef CPU_ARCH_ARM32
705
if constexpr (!aligned)
706
return GSVector2i(vreinterpret_s32_s8(vld1_s8((const int8_t*)p)));
707
#endif
708
709
return GSVector2i(vld1_s32((const int32_t*)p));
710
}
711
712
ALWAYS_INLINE static void store32(void* p, const GSVector2i& v)
713
{
714
s32 val = vget_lane_s32(v, 0);
715
std::memcpy(p, &val, sizeof(s32));
716
}
717
718
template<bool aligned>
719
ALWAYS_INLINE static void store(void* p, const GSVector2i& v)
720
{
721
#ifdef CPU_ARCH_ARM32
722
if constexpr (!aligned)
723
{
724
vst1_s8((int8_t*)p, vreinterpret_s8_s32(v.v2s));
725
return;
726
}
727
#endif
728
729
vst1_s32((int32_t*)p, v.v2s);
730
}
731
732
ALWAYS_INLINE void operator&=(const GSVector2i& v)
733
{
734
v2s = vreinterpret_s32_s8(vand_s8(vreinterpret_s8_s32(v2s), vreinterpret_s8_s32(v.v2s)));
735
}
736
737
ALWAYS_INLINE void operator|=(const GSVector2i& v)
738
{
739
v2s = vreinterpret_s32_s8(vorr_s8(vreinterpret_s8_s32(v2s), vreinterpret_s8_s32(v.v2s)));
740
}
741
742
ALWAYS_INLINE void operator^=(const GSVector2i& v)
743
{
744
v2s = vreinterpret_s32_s8(veor_s8(vreinterpret_s8_s32(v2s), vreinterpret_s8_s32(v.v2s)));
745
}
746
747
ALWAYS_INLINE friend GSVector2i operator&(const GSVector2i& v1, const GSVector2i& v2)
748
{
749
return GSVector2i(vreinterpret_s32_s8(vand_s8(vreinterpret_s8_s32(v1.v2s), vreinterpret_s8_s32(v2.v2s))));
750
}
751
752
ALWAYS_INLINE friend GSVector2i operator|(const GSVector2i& v1, const GSVector2i& v2)
753
{
754
return GSVector2i(vreinterpret_s32_s8(vorr_s8(vreinterpret_s8_s32(v1.v2s), vreinterpret_s8_s32(v2.v2s))));
755
}
756
757
ALWAYS_INLINE friend GSVector2i operator^(const GSVector2i& v1, const GSVector2i& v2)
758
{
759
return GSVector2i(vreinterpret_s32_s8(veor_s8(vreinterpret_s8_s32(v1.v2s), vreinterpret_s8_s32(v2.v2s))));
760
}
761
762
ALWAYS_INLINE friend GSVector2i operator&(const GSVector2i& v, int i) { return v & GSVector2i(i); }
763
764
ALWAYS_INLINE friend GSVector2i operator|(const GSVector2i& v, int i) { return v | GSVector2i(i); }
765
766
ALWAYS_INLINE friend GSVector2i operator^(const GSVector2i& v, int i) { return v ^ GSVector2i(i); }
767
768
ALWAYS_INLINE friend GSVector2i operator~(const GSVector2i& v) { return GSVector2i(vmvn_s32(v.v2s)); }
769
770
ALWAYS_INLINE static GSVector2i zero() { return GSVector2i(0); }
771
772
ALWAYS_INLINE GSVector2i xy() const { return *this; }
773
ALWAYS_INLINE GSVector2i xx() const { return GSVector2i(__builtin_shufflevector(v2s, v2s, 0, 0)); }
774
ALWAYS_INLINE GSVector2i yx() const { return GSVector2i(__builtin_shufflevector(v2s, v2s, 1, 0)); }
775
ALWAYS_INLINE GSVector2i yy() const { return GSVector2i(__builtin_shufflevector(v2s, v2s, 1, 1)); }
776
};
777
778
class alignas(16) GSVector2
779
{
780
struct cxpr_init_tag
781
{
782
};
783
static constexpr cxpr_init_tag cxpr_init{};
784
785
constexpr GSVector2(cxpr_init_tag, float x, float y) : F32{x, y} {}
786
787
constexpr GSVector2(cxpr_init_tag, int x, int y) : I32{x, y} {}
788
789
public:
790
union
791
{
792
struct
793
{
794
float x, y;
795
};
796
struct
797
{
798
float r, g;
799
};
800
float F32[2];
801
double F64[1];
802
s8 I8[8];
803
s16 I16[4];
804
s32 I32[2];
805
s64 I64[1];
806
u8 U8[8];
807
u16 U16[4];
808
u32 U32[2];
809
u64 U64[1];
810
float32x2_t v2s;
811
};
812
813
GSVector2() = default;
814
815
constexpr static GSVector2 cxpr(float x, float y) { return GSVector2(cxpr_init, x, y); }
816
817
constexpr static GSVector2 cxpr(float x) { return GSVector2(cxpr_init, x, x); }
818
819
constexpr static GSVector2 cxpr(int x, int y) { return GSVector2(cxpr_init, x, y); }
820
821
constexpr static GSVector2 cxpr(int x) { return GSVector2(cxpr_init, x, x); }
822
823
ALWAYS_INLINE GSVector2(float x, float y) : v2s(vset_lane_f32(y, vdup_n_f32(x), 1)) {}
824
825
ALWAYS_INLINE GSVector2(int x, int y) : v2s(vcvt_f32_s32(vset_lane_s32(y, vdup_n_s32(x), 1))) {}
826
827
ALWAYS_INLINE constexpr explicit GSVector2(float32x2_t m) : v2s(m) {}
828
829
ALWAYS_INLINE explicit GSVector2(float f) { v2s = vdup_n_f32(f); }
830
831
ALWAYS_INLINE explicit GSVector2(int i) { v2s = vcvt_f32_s32(vdup_n_s32(i)); }
832
833
ALWAYS_INLINE explicit GSVector2(const GSVector2i& v);
834
835
ALWAYS_INLINE static GSVector2 cast(const GSVector2i& v);
836
837
ALWAYS_INLINE void operator=(float f) { v2s = vdup_n_f32(f); }
838
839
ALWAYS_INLINE void operator=(float32x2_t m) { v2s = m; }
840
841
ALWAYS_INLINE operator float32x2_t() const { return v2s; }
842
843
ALWAYS_INLINE GSVector2 abs() const { return GSVector2(vabs_f32(v2s)); }
844
ALWAYS_INLINE GSVector2 neg() const { return GSVector2(vneg_f32(v2s)); }
845
846
#ifdef CPU_ARCH_ARM64
847
848
ALWAYS_INLINE GSVector2 floor() const { return GSVector2(vrndm_f32(v2s)); }
849
ALWAYS_INLINE GSVector2 ceil() const { return GSVector2(vrndp_f32(v2s)); }
850
851
#else
852
853
ALWAYS_INLINE GSVector2 floor() const
854
{
855
return GSVector2(std::floor(vget_lane_f32(v2s, 0)), std::floor(vget_lane_f32(v2s, 1)));
856
}
857
858
ALWAYS_INLINE GSVector2 ceil() const
859
{
860
return GSVector2(std::ceil(vget_lane_f32(v2s, 0)), std::ceil(vget_lane_f32(v2s, 1)));
861
}
862
863
#endif
864
865
ALWAYS_INLINE GSVector2 sat(const GSVector2& a, const GSVector2& b) const { return max(a).min(b); }
866
867
ALWAYS_INLINE GSVector2 sat(const float scale = 255) const { return sat(zero(), GSVector2(scale)); }
868
869
ALWAYS_INLINE GSVector2 clamp(const float scale = 255) const { return min(GSVector2(scale)); }
870
871
ALWAYS_INLINE GSVector2 min(const GSVector2& a) const { return GSVector2(vmin_f32(v2s, a.v2s)); }
872
873
ALWAYS_INLINE GSVector2 max(const GSVector2& a) const { return GSVector2(vmax_f32(v2s, a.v2s)); }
874
875
template<int mask>
876
ALWAYS_INLINE GSVector2 blend32(const GSVector2& a) const
877
{
878
return GSVector2(__builtin_shufflevector(v2s, a.v2s, (mask & 1) ? 4 : 0, (mask & 2) ? 5 : 1));
879
}
880
881
ALWAYS_INLINE GSVector2 blend32(const GSVector2& a, const GSVector2& mask) const
882
{
883
// duplicate sign bit across and bit select
884
const uint32x2_t bitmask = vreinterpret_u32_s32(vshr_n_s32(vreinterpret_s32_f32(mask.v2s), 31));
885
return GSVector2(vbsl_f32(bitmask, a.v2s, v2s));
886
}
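
// The runtime blend above keys off the sign bit of each lane of 'mask'
// (vshr_n_s32 by 31 turns it into an all-ones/all-zeros selector), so lanes
// whose mask value is negative come from 'a'. A small hedged usage sketch:
//
//   GSVector2 lo(1.0f, 2.0f), hi(3.0f, 4.0f);
//   GSVector2 r = lo.blend32(hi, lo < hi); // takes 'hi' in lanes where lo < hi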
887
888
ALWAYS_INLINE GSVector2 andnot(const GSVector2& v) const
889
{
890
return GSVector2(vreinterpret_f32_s32(vbic_s32(vreinterpret_s32_f32(v2s), vreinterpret_s32_f32(v.v2s))));
891
}
892
893
ALWAYS_INLINE int mask() const
894
{
895
const uint32x2_t masks = vshr_n_u32(vreinterpret_u32_f32(v2s), 31);
896
return (vget_lane_u32(masks, 0) | (vget_lane_u32(masks, 1) << 1));
897
}
898
899
ALWAYS_INLINE bool alltrue() const
900
{
901
return (vget_lane_u64(vreinterpret_u64_f32(v2s), 0) == UINT64_C(0xFFFFFFFFFFFFFFFF));
902
}
903
904
ALWAYS_INLINE bool allfalse() const { return (vget_lane_u64(vreinterpret_u64_f32(v2s), 0) == UINT64_C(0)); }
905
906
ALWAYS_INLINE GSVector2 replace_nan(const GSVector2& v) const { return v.blend32(*this, *this == *this); }
907
908
template<int src, int dst>
909
ALWAYS_INLINE GSVector2 insert32(const GSVector2& v) const
910
{
911
#ifdef CPU_ARCH_ARM64
912
return GSVector2(vcopy_lane_f32(v2s, dst, v.v2s, src));
913
#else
914
return GSVector2(vset_lane_f32(vget_lane_f32(v.v2s, src), v2s, dst));
915
#endif
916
}
917
918
template<int i>
919
ALWAYS_INLINE int extract32() const
920
{
921
return vget_lane_s32(vreinterpret_s32_f32(v2s), i);
922
}
923
924
ALWAYS_INLINE float dot(const GSVector2& v) const
925
{
926
#ifdef CPU_ARCH_ARM64
927
return vaddv_f32(vmul_f32(v2s, v.v2s));
928
#else
929
const float32x2_t dp = vmul_f32(v2s, v.v2s);
930
return vget_lane_f32(vadd_f32(dp, vdup_lane_f32(dp, 1)), 0);
931
#endif
932
}
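
// e.g. GSVector2(1.0f, 2.0f).dot(GSVector2(3.0f, 4.0f)) == 11.0f on both paths.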
933
934
ALWAYS_INLINE static GSVector2 zero() { return GSVector2(vdup_n_f32(0.0f)); }
935
936
ALWAYS_INLINE static GSVector2 xffffffff() { return GSVector2(vreinterpret_f32_u32(vdup_n_u32(0xFFFFFFFFu))); }
937
938
template<bool aligned>
939
ALWAYS_INLINE static GSVector2 load(const void* p)
940
{
941
#ifdef CPU_ARCH_ARM32
942
if constexpr (!aligned)
943
return GSVector2(vreinterpret_f32_s8(vld1_s8((const int8_t*)p)));
944
#endif
945
946
return GSVector2(vld1_f32(static_cast<const float*>(p)));
947
}
948
949
template<bool aligned>
950
ALWAYS_INLINE static void store(void* p, const GSVector2& v)
951
{
952
#ifdef CPU_ARCH_ARM32
953
if constexpr (!aligned)
954
{
955
vst1_s8(static_cast<int8_t*>(p), vreinterpret_s8_f32(v.v2s));
956
return;
957
}
958
#endif
959
960
vst1_f32(static_cast<float*>(p), v.v2s);
961
}
962
963
ALWAYS_INLINE GSVector2 operator-() const { return neg(); }
964
965
ALWAYS_INLINE void operator+=(const GSVector2& v) { v2s = vadd_f32(v2s, v.v2s); }
966
ALWAYS_INLINE void operator-=(const GSVector2& v) { v2s = vsub_f32(v2s, v.v2s); }
967
ALWAYS_INLINE void operator*=(const GSVector2& v) { v2s = vmul_f32(v2s, v.v2s); }
968
ALWAYS_INLINE void operator/=(const GSVector2& v)
969
{
970
#ifdef CPU_ARCH_ARM64
971
v2s = vdiv_f32(v2s, v.v2s);
972
#else
973
*this = GSVector2(vget_lane_f32(v2s, 0) / vget_lane_f32(v.v2s, 0), vget_lane_f32(v2s, 1) / vget_lane_f32(v.v2s, 1));
974
#endif
975
}
976
977
ALWAYS_INLINE void operator+=(float f) { *this += GSVector2(f); }
978
ALWAYS_INLINE void operator-=(float f) { *this -= GSVector2(f); }
979
ALWAYS_INLINE void operator*=(float f) { *this *= GSVector2(f); }
980
ALWAYS_INLINE void operator/=(float f) { *this /= GSVector2(f); }
981
982
ALWAYS_INLINE void operator&=(const GSVector2& v)
983
{
984
v2s = vreinterpret_f32_u32(vand_u32(vreinterpret_u32_f32(v2s), vreinterpret_u32_f32(v.v2s)));
985
}
986
987
ALWAYS_INLINE void operator|=(const GSVector2& v)
988
{
989
v2s = vreinterpret_f32_u32(vorr_u32(vreinterpret_u32_f32(v2s), vreinterpret_u32_f32(v.v2s)));
990
}
991
992
ALWAYS_INLINE void operator^=(const GSVector2& v)
993
{
994
v2s = vreinterpret_f32_u32(veor_u32(vreinterpret_u32_f32(v2s), vreinterpret_u32_f32(v.v2s)));
995
}
996
997
ALWAYS_INLINE friend GSVector2 operator+(const GSVector2& v1, const GSVector2& v2)
998
{
999
return GSVector2(vadd_f32(v1.v2s, v2.v2s));
1000
}
1001
1002
ALWAYS_INLINE friend GSVector2 operator-(const GSVector2& v1, const GSVector2& v2)
1003
{
1004
return GSVector2(vsub_f32(v1.v2s, v2.v2s));
1005
}
1006
1007
ALWAYS_INLINE friend GSVector2 operator*(const GSVector2& v1, const GSVector2& v2)
1008
{
1009
return GSVector2(vmul_f32(v1.v2s, v2.v2s));
1010
}
1011
1012
ALWAYS_INLINE friend GSVector2 operator/(const GSVector2& v1, const GSVector2& v2)
1013
{
1014
#ifdef CPU_ARCH_ARM64
1015
return GSVector2(vdiv_f32(v1.v2s, v2.v2s));
1016
#else
1017
return GSVector2(vget_lane_f32(v1.v2s, 0) / vget_lane_f32(v2.v2s, 0),
1018
vget_lane_f32(v1.v2s, 1) / vget_lane_f32(v2.v2s, 1));
1019
#endif
1020
}
1021
1022
ALWAYS_INLINE friend GSVector2 operator+(const GSVector2& v, float f) { return v + GSVector2(f); }
1023
ALWAYS_INLINE friend GSVector2 operator-(const GSVector2& v, float f) { return v - GSVector2(f); }
1024
ALWAYS_INLINE friend GSVector2 operator*(const GSVector2& v, float f) { return v * GSVector2(f); }
1025
ALWAYS_INLINE friend GSVector2 operator/(const GSVector2& v, float f) { return v / GSVector2(f); }
1026
1027
ALWAYS_INLINE friend GSVector2 operator&(const GSVector2& v1, const GSVector2& v2)
1028
{
1029
return GSVector2(vreinterpret_f32_u32(vand_u32(vreinterpret_u32_f32(v1.v2s), vreinterpret_u32_f32(v2.v2s))));
1030
}
1031
1032
ALWAYS_INLINE friend GSVector2 operator|(const GSVector2& v1, const GSVector2& v2)
1033
{
1034
return GSVector2(vreinterpret_f32_u32(vorr_u32(vreinterpret_u32_f32(v1.v2s), vreinterpret_u32_f32(v2.v2s))));
1035
}
1036
1037
ALWAYS_INLINE friend GSVector2 operator^(const GSVector2& v1, const GSVector2& v2)
1038
{
1039
return GSVector2(vreinterpret_f32_u32(veor_u32(vreinterpret_u32_f32(v1.v2s), vreinterpret_u32_f32(v2.v2s))));
1040
}
1041
1042
ALWAYS_INLINE friend GSVector2 operator==(const GSVector2& v1, const GSVector2& v2)
1043
{
1044
return GSVector2(vreinterpret_f32_u32(vceq_f32(v1.v2s, v2.v2s)));
1045
}
1046
1047
ALWAYS_INLINE friend GSVector2 operator!=(const GSVector2& v1, const GSVector2& v2)
1048
{
1049
// NEON has no !=
1050
return GSVector2(vreinterpret_f32_u32(vmvn_u32(vceq_f32(v1.v2s, v2.v2s))));
1051
}
1052
1053
ALWAYS_INLINE friend GSVector2 operator>(const GSVector2& v1, const GSVector2& v2)
1054
{
1055
return GSVector2(vreinterpret_f32_u32(vcgt_f32(v1.v2s, v2.v2s)));
1056
}
1057
1058
ALWAYS_INLINE friend GSVector2 operator<(const GSVector2& v1, const GSVector2& v2)
1059
{
1060
return GSVector2(vreinterpret_f32_u32(vclt_f32(v1.v2s, v2.v2s)));
1061
}
1062
1063
ALWAYS_INLINE friend GSVector2 operator>=(const GSVector2& v1, const GSVector2& v2)
1064
{
1065
return GSVector2(vreinterpret_f32_u32(vcge_f32(v1.v2s, v2.v2s)));
1066
}
1067
1068
ALWAYS_INLINE friend GSVector2 operator<=(const GSVector2& v1, const GSVector2& v2)
1069
{
1070
return GSVector2(vreinterpret_f32_u32(vcle_f32(v1.v2s, v2.v2s)));
1071
}
1072
1073
ALWAYS_INLINE GSVector2 xy() const { return *this; }
1074
ALWAYS_INLINE GSVector2 xx() const { return GSVector2(__builtin_shufflevector(v2s, v2s, 0, 0)); }
1075
ALWAYS_INLINE GSVector2 yx() const { return GSVector2(__builtin_shufflevector(v2s, v2s, 1, 0)); }
1076
ALWAYS_INLINE GSVector2 yy() const { return GSVector2(__builtin_shufflevector(v2s, v2s, 1, 1)); }
1077
};
1078
1079
class alignas(16) GSVector4i
1080
{
1081
struct cxpr_init_tag
1082
{
1083
};
1084
static constexpr cxpr_init_tag cxpr_init{};
1085
1086
constexpr GSVector4i(cxpr_init_tag, s32 x, s32 y, s32 z, s32 w) : S32{x, y, z, w} {}
1087
1088
constexpr GSVector4i(cxpr_init_tag, s16 s0, s16 s1, s16 s2, s16 s3, s16 s4, s16 s5, s16 s6, s16 s7)
1089
: S16{s0, s1, s2, s3, s4, s5, s6, s7}
1090
{
1091
}
1092
1093
constexpr GSVector4i(cxpr_init_tag, s8 b0, s8 b1, s8 b2, s8 b3, s8 b4, s8 b5, s8 b6, s8 b7, s8 b8, s8 b9, s8 b10,
1094
s8 b11, s8 b12, s8 b13, s8 b14, s8 b15)
1095
: S8{b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15}
1096
{
1097
}
1098
1099
public:
1100
union
1101
{
1102
struct
1103
{
1104
int x, y, z, w;
1105
};
1106
struct
1107
{
1108
int r, g, b, a;
1109
};
1110
struct
1111
{
1112
int left, top, right, bottom;
1113
};
1114
float F32[4];
1115
s8 S8[16];
1116
s16 S16[8];
1117
s32 S32[4];
1118
s64 S64[2];
1119
u8 U8[16];
1120
u16 U16[8];
1121
u32 U32[4];
1122
u64 U64[2];
1123
int32x4_t v4s;
1124
};
1125
1126
GSVector4i() = default;
1127
1128
ALWAYS_INLINE constexpr static GSVector4i cxpr(s32 x, s32 y, s32 z, s32 w)
1129
{
1130
return GSVector4i(cxpr_init, x, y, z, w);
1131
}
1132
1133
ALWAYS_INLINE constexpr static GSVector4i cxpr(s32 x) { return GSVector4i(cxpr_init, x, x, x, x); }
1134
1135
ALWAYS_INLINE constexpr static GSVector4i cxpr16(s16 x) { return GSVector4i(cxpr_init, x, x, x, x, x, x, x, x); }
1136
1137
ALWAYS_INLINE constexpr static GSVector4i cxpr16(s16 s0, s16 s1, s16 s2, s16 s3, s16 s4, s16 s5, s16 s6, s16 s7)
1138
{
1139
return GSVector4i(cxpr_init, s0, s1, s2, s3, s4, s5, s6, s7);
1140
}
1141
1142
ALWAYS_INLINE constexpr static GSVector4i cxpr8(s8 b0, s8 b1, s8 b2, s8 b3, s8 b4, s8 b5, s8 b6, s8 b7, s8 b8, s8 b9,
1143
s8 b10, s8 b11, s8 b12, s8 b13, s8 b14, s8 b15)
1144
{
1145
return GSVector4i(cxpr_init, b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15);
1146
}
1147
1148
ALWAYS_INLINE GSVector4i(s32 x, s32 y, s32 z, s32 w)
1149
: v4s(vsetq_lane_s32(w, vsetq_lane_s32(z, vsetq_lane_s32(y, vdupq_n_s32(x), 1), 2), 3))
1150
{
1151
}
1152
1153
ALWAYS_INLINE GSVector4i(s16 s0, s16 s1, s16 s2, s16 s3, s16 s4, s16 s5, s16 s6, s16 s7)
1154
: S16{s0, s1, s2, s3, s4, s5, s6, s7}
1155
{
1156
}
1157
1158
constexpr GSVector4i(s8 b0, s8 b1, s8 b2, s8 b3, s8 b4, s8 b5, s8 b6, s8 b7, s8 b8, s8 b9, s8 b10, s8 b11, s8 b12,
1159
s8 b13, s8 b14, s8 b15)
1160
: S8{b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15}
1161
{
1162
}
1163
1164
ALWAYS_INLINE explicit GSVector4i(const GSVector2i& v) : v4s(vcombine_s32(v.v2s, vcreate_s32(0))) {}
1165
1166
ALWAYS_INLINE explicit GSVector4i(s32 i) { *this = i; }
1167
1168
ALWAYS_INLINE constexpr explicit GSVector4i(int32x4_t m) : v4s(m) {}
1169
1170
ALWAYS_INLINE explicit GSVector4i(const GSVector2& v) : v4s(vcombine_s32(vcvt_s32_f32(v.v2s), vcreate_s32(0))) {}
1171
ALWAYS_INLINE explicit GSVector4i(const GSVector4& v);
1172
1173
ALWAYS_INLINE static GSVector4i cast(const GSVector4& v);
1174
1175
ALWAYS_INLINE void operator=(s32 i) { v4s = vdupq_n_s32(i); }
1176
1177
ALWAYS_INLINE operator int32x4_t() const { return v4s; }
1178
1179
// rect
1180
1181
ALWAYS_INLINE s32 width() const { return right - left; }
1182
ALWAYS_INLINE s32 height() const { return bottom - top; }
1183
1184
ALWAYS_INLINE GSVector2i rsize() const { return zwzw().sub32(xyxy()).xy(); }
1185
1186
ALWAYS_INLINE bool rempty() const
1187
{
1188
// !any((x, y) < (z, w)) i.e. !not_empty
1189
return (vget_lane_u64(vreinterpret_u64_u32(vclt_s32(vget_low_s32(v4s), vget_high_s32(v4s))), 0) !=
1190
0xFFFFFFFFFFFFFFFFULL);
1191
}
1192
1193
ALWAYS_INLINE bool rvalid() const
1194
{
1195
// !all((x, y) >= (z, w))
1196
return (vget_lane_u64(vreinterpret_u64_u32(vcge_s32(vget_low_s32(v4s), vget_high_s32(v4s))), 0) == 0);
1197
}
1198
1199
ALWAYS_INLINE GSVector4i runion(const GSVector4i& a) const { return min_s32(a).upl64(max_s32(a).srl<8>()); }
1200
1201
ALWAYS_INLINE GSVector4i rintersect(const GSVector4i& a) const { return sat_s32(a); }
1202
ALWAYS_INLINE bool rintersects(const GSVector4i& v) const { return rintersect(v).rvalid(); }
1203
ALWAYS_INLINE bool rcontains(const GSVector4i& v) const { return rintersect(v).eq(v); }
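
// Rect convention: when a GSVector4i holds a rectangle, (x, y, z, w) are
// (left, top, right, bottom). E.g. for GSVector4i r(10, 20, 30, 40),
// r.width() == 20, r.height() == 20, and r.rcontains(GSVector4i(12, 22, 28, 38))
// is true because the intersection equals the inner rect.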
1204
1205
ALWAYS_INLINE u32 rgba32() const { return static_cast<u32>(ps32().pu16().extract32<0>()); }
1206
1207
ALWAYS_INLINE GSVector4i sat_s8(const GSVector4i& min, const GSVector4i& max) const
1208
{
1209
return max_s8(min).min_s8(max);
1210
}
1211
ALWAYS_INLINE GSVector4i sat_s8(const GSVector4i& minmax) const
1212
{
1213
return max_s8(minmax.xyxy()).min_s8(minmax.zwzw());
1214
}
1215
ALWAYS_INLINE GSVector4i sat_s16(const GSVector4i& min, const GSVector4i& max) const
1216
{
1217
return max_s16(min).min_s16(max);
1218
}
1219
ALWAYS_INLINE GSVector4i sat_s16(const GSVector4i& minmax) const
1220
{
1221
return max_s16(minmax.xyxy()).min_s16(minmax.zwzw());
1222
}
1223
ALWAYS_INLINE GSVector4i sat_s32(const GSVector4i& min, const GSVector4i& max) const
1224
{
1225
return max_s32(min).min_s32(max);
1226
}
1227
ALWAYS_INLINE GSVector4i sat_s32(const GSVector4i& minmax) const
1228
{
1229
return max_s32(minmax.xyxy()).min_s32(minmax.zwzw());
1230
}
1231
1232
ALWAYS_INLINE GSVector4i sat_u8(const GSVector4i& min, const GSVector4i& max) const
1233
{
1234
return max_u8(min).min_u8(max);
1235
}
1236
ALWAYS_INLINE GSVector4i sat_u8(const GSVector4i& minmax) const
1237
{
1238
return max_u8(minmax.xyxy()).min_u8(minmax.zwzw());
1239
}
1240
ALWAYS_INLINE GSVector4i sat_u16(const GSVector4i& min, const GSVector4i& max) const
1241
{
1242
return max_u16(min).min_u16(max);
1243
}
1244
ALWAYS_INLINE GSVector4i sat_u16(const GSVector4i& minmax) const
1245
{
1246
return max_u16(minmax.xyxy()).min_u16(minmax.zwzw());
1247
}
1248
ALWAYS_INLINE GSVector4i sat_u32(const GSVector4i& min, const GSVector4i& max) const
1249
{
1250
return max_u32(min).min_u32(max);
1251
}
1252
ALWAYS_INLINE GSVector4i sat_u32(const GSVector4i& minmax) const
1253
{
1254
return max_u32(minmax.xyxy()).min_u32(minmax.zwzw());
1255
}
1256
1257
ALWAYS_INLINE GSVector4i min_s8(const GSVector4i& v) const
1258
{
1259
return GSVector4i(vreinterpretq_s32_s8(vminq_s8(vreinterpretq_s8_s32(v4s), vreinterpretq_s8_s32(v.v4s))));
1260
}
1261
1262
ALWAYS_INLINE GSVector4i max_s8(const GSVector4i& v) const
1263
{
1264
return GSVector4i(vreinterpretq_s32_s8(vmaxq_s8(vreinterpretq_s8_s32(v4s), vreinterpretq_s8_s32(v.v4s))));
1265
}
1266
1267
ALWAYS_INLINE GSVector4i min_s16(const GSVector4i& v) const
1268
{
1269
return GSVector4i(vreinterpretq_s32_s16(vminq_s16(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v.v4s))));
1270
}
1271
1272
ALWAYS_INLINE GSVector4i max_s16(const GSVector4i& v) const
1273
{
1274
return GSVector4i(vreinterpretq_s32_s16(vmaxq_s16(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v.v4s))));
1275
}
1276
1277
ALWAYS_INLINE GSVector4i min_s32(const GSVector4i& v) const { return GSVector4i(vminq_s32(v4s, v.v4s)); }
1278
1279
ALWAYS_INLINE GSVector4i max_s32(const GSVector4i& v) const { return GSVector4i(vmaxq_s32(v4s, v.v4s)); }
1280
1281
ALWAYS_INLINE GSVector4i min_u8(const GSVector4i& v) const
1282
{
1283
return GSVector4i(vreinterpretq_s32_u8(vminq_u8(vreinterpretq_u8_s32(v4s), vreinterpretq_u8_s32(v.v4s))));
1284
}
1285
1286
ALWAYS_INLINE GSVector4i max_u8(const GSVector4i& v) const
1287
{
1288
return GSVector4i(vreinterpretq_s32_u8(vmaxq_u8(vreinterpretq_u8_s32(v4s), vreinterpretq_u8_s32(v.v4s))));
1289
}
1290
1291
ALWAYS_INLINE GSVector4i min_u16(const GSVector4i& v) const
1292
{
1293
return GSVector4i(vreinterpretq_s32_u16(vminq_u16(vreinterpretq_u16_s32(v4s), vreinterpretq_u16_s32(v.v4s))));
1294
}
1295
1296
ALWAYS_INLINE GSVector4i max_u16(const GSVector4i& v) const
1297
{
1298
return GSVector4i(vreinterpretq_s32_u16(vmaxq_u16(vreinterpretq_u16_s32(v4s), vreinterpretq_u16_s32(v.v4s))));
1299
}
1300
1301
ALWAYS_INLINE GSVector4i min_u32(const GSVector4i& v) const
1302
{
1303
return GSVector4i(vreinterpretq_s32_u32(vminq_u32(vreinterpretq_u32_s32(v4s), vreinterpretq_u32_s32(v.v4s))));
1304
}
1305
1306
ALWAYS_INLINE GSVector4i max_u32(const GSVector4i& v) const
1307
{
1308
return GSVector4i(vreinterpretq_s32_u32(vmaxq_u32(vreinterpretq_u32_s32(v4s), vreinterpretq_u32_s32(v.v4s))));
1309
}
1310
1311
ALWAYS_INLINE GSVector4i madd_s16(const GSVector4i& v) const
1312
{
1313
#ifdef CPU_ARCH_ARM64
1314
// pair adjacent 16-bit products into 32-bit lanes, matching the sse2neon
// fallback below (and SSE2 pmaddwd)
const int32x4_t low =
vmull_s16(vget_low_s16(vreinterpretq_s16_s32(v4s)), vget_low_s16(vreinterpretq_s16_s32(v.v4s)));
const int32x4_t high = vmull_high_s16(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v.v4s));
return GSVector4i(vpaddq_s32(low, high));
1317
#else
1318
// borrowed from sse2neon
1319
const int32x4_t low =
1320
vmlal_s16(vdupq_n_s32(0), vget_low_s16(vreinterpretq_s16_s32(v4s)), vget_low_s16(vreinterpretq_s16_s32(v.v4s)));
1321
const int32x4_t high =
1322
vmlal_s16(vdupq_n_s32(0), vget_high_s16(vreinterpretq_s16_s32(v4s)), vget_high_s16(vreinterpretq_s16_s32(v.v4s)));
1323
return GSVector4i(vcombine_s32(vpadd_s32(vget_low_s32(low), vget_high_s32(low)),
1324
vpadd_s32(vget_low_s32(high), vget_high_s32(high))));
1325
#endif
1326
}
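
// madd_s16 mirrors SSE2's pmaddwd: multiply adjacent signed 16-bit elements
// and sum each pair into a 32-bit lane. E.g. {1, 2, 3, 4, ...} madd'ed with
// {10, 20, 30, 40, ...} gives 1*10 + 2*20 == 50 in lane 0 and 3*30 + 4*40 == 250
// in lane 1.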
1327
1328
ALWAYS_INLINE GSVector4i addp_s32() const
1329
{
1330
#ifdef CPU_ARCH_ARM64
1331
return GSVector4i(vpaddq_s32(v4s, v4s));
1332
#else
1333
const int32x2_t res = vpadd_s32(vget_low_s32(v4s), vget_high_s32(v4s));
1334
return GSVector4i(vcombine_s32(res, res));
1335
#endif
1336
}
1337
1338
ALWAYS_INLINE s32 addv_s32() const
1339
{
1340
#ifdef CPU_ARCH_ARM64
1341
return vaddvq_s32(v4s);
1342
#else
1343
const int32x2_t res = vpadd_s32(vget_low_s32(v4s), vget_high_s32(v4s));
1344
return vget_lane_s32(res, 0) + vget_lane_s32(res, 1);
1345
#endif
1346
}
1347
1348
#ifdef CPU_ARCH_ARM64
1349
1350
ALWAYS_INLINE u8 minv_u8() const { return vminvq_u8(vreinterpretq_u8_s32(v4s)); }
1351
1352
ALWAYS_INLINE u16 maxv_u8() const { return vmaxvq_u8(vreinterpretq_u8_s32(v4s)); }
1353
1354
ALWAYS_INLINE u16 minv_u16() const { return vminvq_u16(vreinterpretq_u16_s32(v4s)); }
1355
1356
ALWAYS_INLINE u16 maxv_u16() const { return vmaxvq_u16(vreinterpretq_u16_s32(v4s)); }
1357
1358
ALWAYS_INLINE s32 minv_s32() const { return vminvq_s32(v4s); }
1359
1360
ALWAYS_INLINE u32 minv_u32() const { return vminvq_u32(vreinterpretq_u32_s32(v4s)); }
1361
1362
ALWAYS_INLINE s32 maxv_s32() const { return vmaxvq_s32(v4s); }
1363
1364
ALWAYS_INLINE u32 maxv_u32() const { return vmaxvq_u32(vreinterpretq_u32_s32(v4s)); }
1365
1366
#else
1367
1368
ALWAYS_INLINE u8 minv_u8() const
1369
{
1370
uint8x8_t vmin = vmin_u8(vget_low_u8(vreinterpretq_u8_s32(v4s)), vget_high_u8(vreinterpretq_u8_s32(v4s)));
1371
vmin = vmin_u8(vmin, vreinterpret_u8_s32(vdup_lane_s32(vreinterpret_s32_u8(vmin), 1)));
1372
return static_cast<u8>(
1373
std::min(static_cast<u32>(vget_lane_u8(vmin, 0)),
1374
std::min(static_cast<u32>(vget_lane_u8(vmin, 1)),
1375
std::min(static_cast<u32>(vget_lane_u8(vmin, 2)), static_cast<u32>(vget_lane_u8(vmin, 3))))));
1376
}
1377
1378
ALWAYS_INLINE u16 maxv_u8() const
1379
{
1380
uint8x8_t vmax = vmax_u8(vget_low_u8(vreinterpretq_u8_s32(v4s)), vget_high_u8(vreinterpretq_u8_s32(v4s)));
1381
vmax = vmax_u8(vmax, vreinterpret_u8_s32(vdup_lane_s32(vreinterpret_s32_u8(vmax), 1)));
1382
return static_cast<u8>(
1383
std::max(static_cast<u32>(vget_lane_u8(vmax, 0)),
1384
std::max(static_cast<u32>(vget_lane_u8(vmax, 1)),
1385
std::max(static_cast<u32>(vget_lane_u8(vmax, 2)), static_cast<u32>(vget_lane_u8(vmax, 3))))));
1386
}
1387
1388
ALWAYS_INLINE u16 minv_u16() const
1389
{
1390
uint16x4_t vmin = vmin_u16(vget_low_u16(vreinterpretq_u16_s32(v4s)), vget_high_u16(vreinterpretq_u16_s32(v4s)));
1391
vmin = vmin_u16(vmin, vreinterpret_u16_s32(vdup_lane_s32(vreinterpret_s32_u16(vmin), 1)));
1392
return static_cast<u16>(
1393
std::min(static_cast<u32>(vget_lane_u16(vmin, 0)), static_cast<u32>(vget_lane_u16(vmin, 1))));
1394
}
1395
1396
ALWAYS_INLINE u16 maxv_u16() const
1397
{
1398
uint16x4_t vmax = vmax_u16(vget_low_u16(vreinterpretq_u16_s32(v4s)), vget_high_u16(vreinterpretq_u16_s32(v4s)));
1399
vmax = vmax_u16(vmax, vreinterpret_u16_s32(vdup_lane_s32(vreinterpret_s32_u16(vmax), 1)));
1400
return static_cast<u16>(
1401
std::max<u32>(static_cast<u32>(vget_lane_u16(vmax, 0)), static_cast<u32>(vget_lane_u16(vmax, 1))));
1402
}
1403
1404
ALWAYS_INLINE s32 minv_s32() const
1405
{
1406
int32x2_t vmin = vmin_s32(vget_low_s32(v4s), vget_high_s32(v4s));
1407
return std::min<s32>(vget_lane_s32(vmin, 0), vget_lane_s32(vmin, 1));
1408
}
1409
1410
ALWAYS_INLINE u32 minv_u32() const
1411
{
1412
uint32x2_t vmin = vmin_u32(vget_low_u32(vreinterpretq_u32_s32(v4s)), vget_high_u32(vreinterpretq_u32_s32(v4s)));
1413
return std::min<u32>(vget_lane_u32(vmin, 0), vget_lane_u32(vmin, 1));
1414
}
1415
1416
ALWAYS_INLINE s32 maxv_s32() const
1417
{
1418
int32x2_t vmax = vmax_s32(vget_low_s32(v4s), vget_high_s32(v4s));
1419
return std::max<s32>(vget_lane_s32(vmax, 0), vget_lane_s32(vmax, 1));
1420
}
1421
1422
ALWAYS_INLINE u32 maxv_u32() const
1423
{
1424
uint32x2_t vmax = vmax_u32(vget_low_u32(vreinterpretq_u32_s32(v4s)), vget_high_u32(vreinterpretq_u32_s32(v4s)));
1425
return std::max<u32>(vget_lane_u32(vmax, 0), vget_lane_u32(vmax, 1));
1426
}
1427
1428
#endif
1429
1430
ALWAYS_INLINE GSVector4i clamp8() const { return pu16().upl8(); }
1431
1432
ALWAYS_INLINE GSVector4i blend8(const GSVector4i& a, const GSVector4i& mask) const
1433
{
1434
uint8x16_t mask2 = vreinterpretq_u8_s8(vshrq_n_s8(vreinterpretq_s8_s32(mask.v4s), 7));
1435
return GSVector4i(vreinterpretq_s32_u8(vbslq_u8(mask2, vreinterpretq_u8_s32(a.v4s), vreinterpretq_u8_s32(v4s))));
1436
}
1437
1438
template<int mask>
1439
ALWAYS_INLINE GSVector4i blend16(const GSVector4i& a) const
1440
{
1441
return GSVector4i(vreinterpretq_s32_s16(__builtin_shufflevector(
1442
vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(a.v4s), ((mask & 0x01) == 0) ? 0 : 8,
1443
((mask & 0x02) == 0) ? 1 : 9, ((mask & 0x04) == 0) ? 2 : 10, ((mask & 0x08) == 0) ? 3 : 11,
1444
((mask & 0x10) == 0) ? 4 : 12, ((mask & 0x20) == 0) ? 5 : 13, ((mask & 0x40) == 0) ? 6 : 14,
1445
((mask & 0x80) == 0) ? 7 : 15)));
1446
}
1447
1448
template<int mask>
1449
ALWAYS_INLINE GSVector4i blend32(const GSVector4i& v) const
1450
{
1451
return GSVector4i(__builtin_shufflevector(v4s, v.v4s, ((mask & 1) == 0) ? 0 : 4, ((mask & 2) == 0) ? 1 : 5,
1452
((mask & 4) == 0) ? 2 : 6, ((mask & 8) == 0) ? 3 : 7));
1453
}
1454
1455
ALWAYS_INLINE GSVector4i blend(const GSVector4i& v, const GSVector4i& mask) const
1456
{
1457
return GSVector4i(
1458
vreinterpretq_s32_s8(vorrq_s8(vbicq_s8(vreinterpretq_s8_s32(v4s), vreinterpretq_s8_s32(mask.v4s)),
1459
vandq_s8(vreinterpretq_s8_s32(mask.v4s), vreinterpretq_s8_s32(v.v4s)))));
1460
}
1461
1462
ALWAYS_INLINE GSVector4i shuffle8(const GSVector4i& mask) const
1463
{
1464
#ifdef CPU_ARCH_ARM64
1465
return GSVector4i(vreinterpretq_s32_s8(vqtbl1q_s8(vreinterpretq_s8_s32(v4s), vreinterpretq_u8_s32(mask.v4s))));
1466
#else
1467
int8x8x2_t split = {vget_low_s8(vreinterpretq_s8_s32(v4s)), vget_high_s8(vreinterpretq_s8_s32(v4s))};
1468
return GSVector4i(vreinterpretq_s32_s8(vcombine_s8(vtbl2_s8(split, vget_low_s8(vreinterpretq_s8_s32(mask.v4s))),
1469
vtbl2_s8(split, vget_high_s8(vreinterpretq_s8_s32(mask.v4s))))));
1470
#endif
1471
}
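
// shuffle8 is a per-byte table lookup (the NEON analogue of SSSE3 pshufb):
// each byte of 'mask' selects a source byte by index, so e.g. an all-zero mask
// broadcasts byte 0 of *this. Note vqtbl1q_s8/vtbl2_s8 return 0 for any index
// >= 16, while pshufb zeroes only when the mask byte's high bit is set, so the
// two behaviours agree for indices 0-15 or masks with the high bit set.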
1472
1473
ALWAYS_INLINE GSVector4i ps16(const GSVector4i& v) const
1474
{
1475
return GSVector4i(vreinterpretq_s32_s8(
1476
vcombine_s8(vqmovn_s16(vreinterpretq_s16_s32(v4s)), vqmovn_s16(vreinterpretq_s16_s32(v.v4s)))));
1477
}
1478
1479
ALWAYS_INLINE GSVector4i ps16() const
1480
{
1481
return GSVector4i(vreinterpretq_s32_s8(
1482
vcombine_s8(vqmovn_s16(vreinterpretq_s16_s32(v4s)), vqmovn_s16(vreinterpretq_s16_s32(v4s)))));
1483
}
1484
1485
ALWAYS_INLINE GSVector4i pu16(const GSVector4i& v) const
1486
{
1487
return GSVector4i(vreinterpretq_s32_u8(
1488
vcombine_u8(vqmovun_s16(vreinterpretq_s16_s32(v4s)), vqmovun_s16(vreinterpretq_s16_s32(v.v4s)))));
1489
}
1490
1491
ALWAYS_INLINE GSVector4i pu16() const
1492
{
1493
return GSVector4i(vreinterpretq_s32_u8(
1494
vcombine_u8(vqmovun_s16(vreinterpretq_s16_s32(v4s)), vqmovun_s16(vreinterpretq_s16_s32(v4s)))));
1495
}
1496
1497
ALWAYS_INLINE GSVector4i ps32(const GSVector4i& v) const
1498
{
1499
return GSVector4i(vreinterpretq_s32_s16(vcombine_s16(vqmovn_s32(v4s), vqmovn_s32(v.v4s))));
1500
}
1501
1502
ALWAYS_INLINE GSVector4i ps32() const
1503
{
1504
return GSVector4i(vreinterpretq_s32_s16(vcombine_s16(vqmovn_s32(v4s), vqmovn_s32(v4s))));
1505
}
1506
1507
ALWAYS_INLINE GSVector4i pu32(const GSVector4i& v) const
1508
{
1509
return GSVector4i(vreinterpretq_s32_u16(vcombine_u16(vqmovun_s32(v4s), vqmovun_s32(v.v4s))));
1510
}
1511
1512
ALWAYS_INLINE GSVector4i pu32() const
1513
{
1514
return GSVector4i(vreinterpretq_s32_u16(vcombine_u16(vqmovun_s32(v4s), vqmovun_s32(v4s))));
1515
}
1516
1517
#ifdef CPU_ARCH_ARM64
1518
1519
ALWAYS_INLINE GSVector4i upl8(const GSVector4i& v) const
1520
{
1521
return GSVector4i(vreinterpretq_s32_s8(vzip1q_s8(vreinterpretq_s8_s32(v4s), vreinterpretq_s8_s32(v.v4s))));
1522
}
1523
1524
ALWAYS_INLINE GSVector4i uph8(const GSVector4i& v) const
1525
{
1526
return GSVector4i(vreinterpretq_s32_s8(vzip2q_s8(vreinterpretq_s8_s32(v4s), vreinterpretq_s8_s32(v.v4s))));
1527
}
1528
1529
ALWAYS_INLINE GSVector4i upl16(const GSVector4i& v) const
1530
{
1531
return GSVector4i(vreinterpretq_s32_s16(vzip1q_s16(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v.v4s))));
1532
}
1533
1534
ALWAYS_INLINE GSVector4i uph16(const GSVector4i& v) const
1535
{
1536
return GSVector4i(vreinterpretq_s32_s16(vzip2q_s16(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v.v4s))));
1537
}
1538
1539
ALWAYS_INLINE GSVector4i upl32(const GSVector4i& v) const { return GSVector4i(vzip1q_s32(v4s, v.v4s)); }
1540
1541
ALWAYS_INLINE GSVector4i uph32(const GSVector4i& v) const { return GSVector4i(vzip2q_s32(v4s, v.v4s)); }
1542
1543
ALWAYS_INLINE GSVector4i upl64(const GSVector4i& v) const
1544
{
1545
return GSVector4i(vreinterpretq_s32_s64(
1546
vcombine_s64(vget_low_s64(vreinterpretq_s64_s32(v4s)), vget_low_s64(vreinterpretq_s64_s32(v.v4s)))));
1547
}
1548
1549
ALWAYS_INLINE GSVector4i uph64(const GSVector4i& v) const
1550
{
1551
return GSVector4i(vreinterpretq_s32_s64(
1552
vcombine_s64(vget_high_s64(vreinterpretq_s64_s32(v4s)), vget_high_s64(vreinterpretq_s64_s32(v.v4s)))));
1553
}
1554
1555
ALWAYS_INLINE GSVector4i upl8() const
1556
{
1557
return GSVector4i(vreinterpretq_s32_s8(vzip1q_s8(vreinterpretq_s8_s32(v4s), vdupq_n_s8(0))));
1558
}
1559
1560
ALWAYS_INLINE GSVector4i uph8() const
1561
{
1562
return GSVector4i(vreinterpretq_s32_s8(vzip2q_s8(vreinterpretq_s8_s32(v4s), vdupq_n_s8(0))));
1563
}
1564
1565
ALWAYS_INLINE GSVector4i upl16() const
1566
{
1567
return GSVector4i(vreinterpretq_s32_s16(vzip1q_s16(vreinterpretq_s16_s32(v4s), vdupq_n_s16(0))));
1568
}
1569
1570
ALWAYS_INLINE GSVector4i uph16() const
1571
{
1572
return GSVector4i(vreinterpretq_s32_s16(vzip2q_s16(vreinterpretq_s16_s32(v4s), vdupq_n_s16(0))));
1573
}
1574
1575
ALWAYS_INLINE GSVector4i upl32() const { return GSVector4i(vzip1q_s32(v4s, vdupq_n_s32(0))); }
1576
1577
ALWAYS_INLINE GSVector4i uph32() const { return GSVector4i(vzip2q_s32(v4s, vdupq_n_s32(0))); }
1578
1579
ALWAYS_INLINE GSVector4i upl64() const
1580
{
1581
return GSVector4i(vreinterpretq_s32_s64(vcombine_s64(vget_low_s64(vreinterpretq_s64_s32(v4s)), vdup_n_s64(0))));
1582
}
1583
1584
ALWAYS_INLINE GSVector4i uph64() const
1585
{
1586
return GSVector4i(vreinterpretq_s32_s64(vcombine_s64(vget_high_s64(vreinterpretq_s64_s32(v4s)), vdup_n_s64(0))));
1587
}
1588
1589
#else
1590
1591
ALWAYS_INLINE GSVector4i upl8(const GSVector4i& v) const
1592
{
1593
const int8x8x2_t res = vzip_s8(vget_low_s8(vreinterpretq_s8_s32(v4s)), vget_low_s8(vreinterpretq_s8_s32(v.v4s)));
1594
return GSVector4i(vreinterpretq_s32_s8(vcombine_s8(res.val[0], res.val[1])));
1595
}
1596
1597
ALWAYS_INLINE GSVector4i uph8(const GSVector4i& v) const
1598
{
1599
const int8x8x2_t res = vzip_s8(vget_high_s8(vreinterpretq_s8_s32(v4s)), vget_high_s8(vreinterpretq_s8_s32(v.v4s)));
1600
return GSVector4i(vreinterpretq_s32_s8(vcombine_s8(res.val[0], res.val[1])));
1601
}
1602
1603
ALWAYS_INLINE GSVector4i upl16(const GSVector4i& v) const
1604
{
1605
const int16x4x2_t res =
1606
vzip_s16(vget_low_s16(vreinterpretq_s16_s32(v4s)), vget_low_s16(vreinterpretq_s16_s32(v.v4s)));
1607
return GSVector4i(vreinterpretq_s32_s16(vcombine_s16(res.val[0], res.val[1])));
1608
}
1609
1610
ALWAYS_INLINE GSVector4i uph16(const GSVector4i& v) const
1611
{
1612
const int16x4x2_t res =
1613
vzip_s16(vget_high_s16(vreinterpretq_s16_s32(v4s)), vget_high_s16(vreinterpretq_s16_s32(v.v4s)));
1614
return GSVector4i(vreinterpretq_s32_s16(vcombine_s16(res.val[0], res.val[1])));
1615
}
1616
1617
ALWAYS_INLINE GSVector4i upl32(const GSVector4i& v) const
1618
{
1619
const int32x2x2_t res = vzip_s32(vget_low_s32(v4s), vget_low_s32(v.v4s));
1620
return GSVector4i(vcombine_s32(res.val[0], res.val[1]));
1621
}
1622
1623
ALWAYS_INLINE GSVector4i uph32(const GSVector4i& v) const
1624
{
1625
const int32x2x2_t res = vzip_s32(vget_high_s32(v4s), vget_high_s32(v.v4s));
1626
return GSVector4i(vcombine_s32(res.val[0], res.val[1]));
1627
}
1628
1629
ALWAYS_INLINE GSVector4i upl64(const GSVector4i& v) const
1630
{
1631
return GSVector4i(vreinterpretq_s32_s64(
1632
vcombine_s64(vget_low_s64(vreinterpretq_s64_s32(v4s)), vget_low_s64(vreinterpretq_s64_s32(v.v4s)))));
1633
}
1634
1635
ALWAYS_INLINE GSVector4i uph64(const GSVector4i& v) const
1636
{
1637
return GSVector4i(vreinterpretq_s32_s64(
1638
vcombine_s64(vget_high_s64(vreinterpretq_s64_s32(v4s)), vget_high_s64(vreinterpretq_s64_s32(v.v4s)))));
1639
}
1640
1641
ALWAYS_INLINE GSVector4i upl8() const { return upl8(GSVector4i(vdupq_n_s32(0))); }
1642
1643
ALWAYS_INLINE GSVector4i uph8() const { return uph8(GSVector4i(vdupq_n_s32(0))); }
1644
1645
ALWAYS_INLINE GSVector4i upl16() const { return upl16(GSVector4i(vdupq_n_s32(0))); }
1646
1647
ALWAYS_INLINE GSVector4i uph16() const { return uph16(GSVector4i(vdupq_n_s32(0))); }
1648
1649
ALWAYS_INLINE GSVector4i upl32() const { return upl32(GSVector4i(vdupq_n_s32(0))); }
1650
1651
ALWAYS_INLINE GSVector4i uph32() const { return uph32(GSVector4i(vdupq_n_s32(0))); }
1652
1653
ALWAYS_INLINE GSVector4i upl64() const
1654
{
1655
return GSVector4i(vreinterpretq_s32_s64(vcombine_s64(vget_low_s64(vreinterpretq_s64_s32(v4s)), vdup_n_s64(0))));
1656
}
1657
1658
ALWAYS_INLINE GSVector4i uph64() const
1659
{
1660
return GSVector4i(vreinterpretq_s32_s64(vcombine_s64(vget_high_s64(vreinterpretq_s64_s32(v4s)), vdup_n_s64(0))));
1661
}
1662
#endif
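// Note: the upl*/uph* family interleaves lanes like PUNPCKL*/PUNPCKH*: upl takes the
// low halves of the two registers, uph the high halves, e.g. upl8(v) yields
// {a0, b0, a1, b1, ...}. The zero-argument forms interleave with zero. The A64 path
// maps straight onto ZIP1/ZIP2; the A32 fallback builds the same result from
// vzip + vcombine because the 128-bit zip1q/zip2q forms are A64-only.
// (Descriptive comment only.)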
1663
1664
ALWAYS_INLINE GSVector4i s8to16() const
1665
{
1666
return GSVector4i(vreinterpretq_s32_s16(vmovl_s8(vget_low_s8(vreinterpretq_s8_s32(v4s)))));
1667
}
1668
1669
ALWAYS_INLINE GSVector4i u8to16() const
1670
{
1671
return GSVector4i(vreinterpretq_s32_u16(vmovl_u8(vget_low_u8(vreinterpretq_u8_s32(v4s)))));
1672
}
1673
1674
ALWAYS_INLINE GSVector4i s8to32() const
1675
{
1676
return GSVector4i(vmovl_s16(vget_low_s16(vmovl_s8(vget_low_s8(vreinterpretq_s8_s32(v4s))))));
1677
}
1678
1679
ALWAYS_INLINE GSVector4i u8to32() const
1680
{
1681
return GSVector4i(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(vreinterpretq_u8_s32(v4s)))))));
1682
}
1683
1684
ALWAYS_INLINE GSVector4i s8to64() const
1685
{
1686
return GSVector4i(vreinterpretq_s32_s64(
1687
vmovl_s32(vget_low_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_low_s8(vreinterpretq_s8_s32(v4s)))))))));
1688
}
1689
1690
ALWAYS_INLINE GSVector4i u8to64() const
1691
{
1692
return GSVector4i(vreinterpretq_s32_u64(
1693
vmovl_u32(vget_low_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(vreinterpretq_u8_s32(v4s)))))))));
1694
}
1695
1696
ALWAYS_INLINE GSVector4i s16to32() const { return GSVector4i(vmovl_s16(vget_low_s16(vreinterpretq_s16_s32(v4s)))); }
1697
1698
ALWAYS_INLINE GSVector4i u16to32() const
1699
{
1700
return GSVector4i(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vreinterpretq_u16_s32(v4s)))));
1701
}
1702
1703
ALWAYS_INLINE GSVector4i s16to64() const
1704
{
1705
return GSVector4i(
1706
vreinterpretq_s32_s64(vmovl_s32(vget_low_s32(vmovl_s16(vget_low_s16(vreinterpretq_s16_s32(v4s)))))));
1707
}
1708
1709
ALWAYS_INLINE GSVector4i u16to64() const
1710
{
1711
return GSVector4i(
1712
vreinterpretq_s32_u64(vmovl_u32(vget_low_u32(vmovl_u16(vget_low_u16(vreinterpretq_u16_s32(v4s)))))));
1713
}
1714
1715
ALWAYS_INLINE GSVector4i s32to64() const { return GSVector4i(vreinterpretq_s32_s64(vmovl_s32(vget_low_s32(v4s)))); }
1716
1717
ALWAYS_INLINE GSVector4i u32to64() const
1718
{
1719
return GSVector4i(vreinterpretq_s32_u64(vmovl_u32(vget_low_u32(vreinterpretq_u32_s32(v4s)))));
1720
}
1721
1722
template<int i>
1723
ALWAYS_INLINE GSVector4i srl() const
1724
{
1725
return GSVector4i(vreinterpretq_s32_s8(vextq_s8(vreinterpretq_s8_s32(v4s), vdupq_n_s8(0), i)));
1726
}
1727
1728
template<int i>
1729
ALWAYS_INLINE GSVector4i srl(const GSVector4i& v) const
1730
{
1731
if constexpr (i >= 16)
1732
return GSVector4i(vreinterpretq_s32_u8(vextq_u8(vreinterpretq_u8_s32(v.v4s), vdupq_n_u8(0), i - 16)));
1733
else
1734
return GSVector4i(vreinterpretq_s32_u8(vextq_u8(vreinterpretq_u8_s32(v4s), vreinterpretq_u8_s32(v.v4s), i)));
1735
}
1736
1737
template<int i>
1738
ALWAYS_INLINE GSVector4i sll() const
1739
{
1740
return GSVector4i(vreinterpretq_s32_s8(vextq_s8(vdupq_n_s8(0), vreinterpretq_s8_s32(v4s), 16 - i)));
1741
}
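// Note: the byte-granularity srl<i>()/sll<i>() above emulate PSRLDQ/PSLLDQ with VEXT:
// extracting from the concatenation of the value and a zero register shifts the whole
// 128-bit quantity by i bytes, e.g. sll<4>() moves lane 0 into lane 1 and zero-fills
// lane 0. The compile-time count has to keep the vext index inside 0..15.
// (Descriptive comment only.)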
1742
1743
template<int i>
1744
ALWAYS_INLINE GSVector4i sll16() const
1745
{
1746
return GSVector4i(vreinterpretq_s32_s16(vshlq_n_s16(vreinterpretq_s16_s32(v4s), i)));
1747
}
1748
1749
ALWAYS_INLINE GSVector4i sll16(s32 i) const
1750
{
1751
return GSVector4i(vreinterpretq_s32_s16(vshlq_s16(vreinterpretq_s16_s32(v4s), vdupq_n_s16(i))));
1752
}
1753
1754
ALWAYS_INLINE GSVector4i sllv16(const GSVector4i& v) const
1755
{
1756
return GSVector4i(vreinterpretq_s32_s16(vshlq_s16(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v.v4s))));
1757
}
1758
1759
template<int i>
1760
ALWAYS_INLINE GSVector4i srl16() const
1761
{
1762
return GSVector4i(vreinterpretq_s32_u16(vshrq_n_u16(vreinterpretq_u16_s32(v4s), i)));
1763
}
1764
1765
ALWAYS_INLINE GSVector4i srl16(s32 i) const
1766
{
1767
return GSVector4i(vreinterpretq_s32_u16(vshlq_u16(vreinterpretq_u16_s32(v4s), vdupq_n_s16(-i))));
1768
}
1769
1770
ALWAYS_INLINE GSVector4i srlv16(const GSVector4i& v) const
1771
{
1772
return GSVector4i(
1773
vreinterpretq_s32_u16(vshlq_u16(vreinterpretq_u16_s32(v4s), vnegq_s16(vreinterpretq_s16_s32(v.v4s)))));
1774
}
1775
1776
template<int i>
1777
ALWAYS_INLINE GSVector4i sra16() const
1778
{
1779
constexpr int count = (i & ~15) ? 15 : i;
1780
return GSVector4i(vreinterpretq_s32_s16(vshrq_n_s16(vreinterpretq_s16_s32(v4s), count)));
1781
}
1782
1783
ALWAYS_INLINE GSVector4i sra16(s32 i) const
1784
{
1785
return GSVector4i(vreinterpretq_s32_s16(vshlq_s16(vreinterpretq_s16_s32(v4s), vdupq_n_s16(-i))));
1786
}
1787
1788
ALWAYS_INLINE GSVector4i srav16(const GSVector4i& v) const
1789
{
1790
return GSVector4i(
1791
vreinterpretq_s32_s16(vshlq_s16(vreinterpretq_s16_s32(v4s), vnegq_s16(vreinterpretq_s16_s32(v.v4s)))));
1792
}
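// Note: NEON has no variable right-shift; VSHL shifts left for positive lane counts
// and right for negative ones, so the *v16 forms negate the per-lane count and reuse
// VSHL: srlv16 through the unsigned form (logical shift), srav16 through the signed
// form (arithmetic shift). (Descriptive comment only.)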
1793
1794
template<int i>
1795
ALWAYS_INLINE GSVector4i sll32() const
1796
{
1797
return GSVector4i(vshlq_n_s32(v4s, i));
1798
}
1799
1800
ALWAYS_INLINE GSVector4i sll32(s32 i) const { return GSVector4i(vshlq_s32(v4s, vdupq_n_s32(i))); }
1801
1802
ALWAYS_INLINE GSVector4i sllv32(const GSVector4i& v) const { return GSVector4i(vshlq_s32(v4s, v.v4s)); }
1803
1804
template<int i>
1805
ALWAYS_INLINE GSVector4i srl32() const
1806
{
1807
return GSVector4i(vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(v4s), i)));
1808
}
1809
1810
ALWAYS_INLINE GSVector4i srl32(s32 i) const
1811
{
1812
return GSVector4i(vreinterpretq_s32_u32(vshlq_u32(vreinterpretq_u32_s32(v4s), vdupq_n_s32(-i))));
1813
}
1814
1815
ALWAYS_INLINE GSVector4i srlv32(const GSVector4i& v) const
1816
{
1817
return GSVector4i(vreinterpretq_s32_u32(vshlq_u32(vreinterpretq_u32_s32(v4s), vnegq_s32(v.v4s))));
1818
}
1819
1820
template<int i>
1821
ALWAYS_INLINE GSVector4i sra32() const
1822
{
1823
return GSVector4i(vshrq_n_s32(v4s, i));
1824
}
1825
1826
ALWAYS_INLINE GSVector4i sra32(s32 i) const { return GSVector4i(vshlq_s32(v4s, vdupq_n_s32(-i))); }
1827
1828
ALWAYS_INLINE GSVector4i srav32(const GSVector4i& v) const
1829
{
1830
return GSVector4i(vshlq_s32(v4s, vnegq_s32(v.v4s)));
1831
}
1832
1833
template<int i>
1834
ALWAYS_INLINE GSVector4i sll64() const
1835
{
1836
return GSVector4i(vreinterpretq_s32_s64(vshlq_n_s64(vreinterpretq_s64_s32(v4s), i)));
1837
}
1838
1839
ALWAYS_INLINE GSVector4i sll64(s32 i) const
1840
{
1841
return GSVector4i(vreinterpretq_s32_s64(vshlq_s64(vreinterpretq_s64_s32(v4s), vdupq_n_s64(i))));
1842
}
1843
1844
ALWAYS_INLINE GSVector4i sllv64(const GSVector4i& v) const
1845
{
1846
return GSVector4i(vreinterpretq_s32_s64(vshlq_s64(vreinterpretq_s64_s32(v4s), vreinterpretq_s64_s32(v.v4s))));
1847
}
1848
1849
template<int i>
1850
ALWAYS_INLINE GSVector4i srl64() const
1851
{
1852
return GSVector4i(vreinterpretq_s32_u64(vshrq_n_u64(vreinterpretq_u64_s32(v4s), i)));
1853
}
1854
1855
ALWAYS_INLINE GSVector4i srl64(s32 i) const
1856
{
1857
return GSVector4i(vreinterpretq_s32_u64(vshlq_u64(vreinterpretq_u64_s32(v4s), vdupq_n_s64(-i))));
1858
}
1859
1860
#ifdef CPU_ARCH_ARM64
1861
ALWAYS_INLINE GSVector4i srlv64(const GSVector4i& v) const
1862
{
1863
return GSVector4i(
1864
vreinterpretq_s32_u64(vshlq_u64(vreinterpretq_u64_s32(v4s), vnegq_s64(vreinterpretq_s64_s32(v.v4s)))));
1865
}
1866
#endif
1867
1868
ALWAYS_INLINE GSVector4i add8(const GSVector4i& v) const
1869
{
1870
return GSVector4i(vreinterpretq_s32_s8(vaddq_s8(vreinterpretq_s8_s32(v4s), vreinterpretq_s8_s32(v.v4s))));
1871
}
1872
1873
ALWAYS_INLINE GSVector4i add16(const GSVector4i& v) const
1874
{
1875
return GSVector4i(vreinterpretq_s32_s16(vaddq_s16(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v.v4s))));
1876
}
1877
1878
ALWAYS_INLINE GSVector4i add32(const GSVector4i& v) const { return GSVector4i(vaddq_s32(v4s, v.v4s)); }
1879
1880
ALWAYS_INLINE GSVector4i adds8(const GSVector4i& v) const
1881
{
1882
return GSVector4i(vreinterpretq_s32_s8(vqaddq_s8(vreinterpretq_s8_s32(v4s), vreinterpretq_s8_s32(v.v4s))));
1883
}
1884
1885
ALWAYS_INLINE GSVector4i adds16(const GSVector4i& v) const
1886
{
1887
return GSVector4i(vreinterpretq_s32_s16(vqaddq_s16(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v.v4s))));
1888
}
1889
1890
ALWAYS_INLINE GSVector4i hadds16(const GSVector4i& v) const
1891
{
1892
// can't use vpaddq_s16() here, because we need saturation.
1893
// return GSVector4i(vreinterpretq_s32_s16(vpaddq_s16(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v.v4s))));
1894
const int16x8_t a = vreinterpretq_s16_s32(v4s);
1895
const int16x8_t b = vreinterpretq_s16_s32(v.v4s);
1896
#ifdef CPU_ARCH_ARM64
1897
return GSVector4i(vreinterpretq_s32_s16(vqaddq_s16(vuzp1q_s16(a, b), vuzp2q_s16(a, b))));
1898
#else
1899
// sse2neon again
1900
int16x8_t ab0246 = vcombine_s16(vmovn_s32(vreinterpretq_s32_s16(a)), vmovn_s32(vreinterpretq_s32_s16(b)));
int16x8_t ab1357 = vcombine_s16(vshrn_n_s32(vreinterpretq_s32_s16(a), 16), vshrn_n_s32(vreinterpretq_s32_s16(b), 16));
1902
return GSVector4i(vreinterpretq_s32_s16(vqaddq_s16(ab0246, ab1357)));
1903
#endif
1904
}
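// Note: hadds16 follows PHADDSW: de-interleave the even/odd s16 lanes of {*this, v}
// and add them with saturation, so the result is {sat(a0+a1), sat(a2+a3), ...,
// sat(b0+b1), ...}. A plain vpaddq_s16 would wrap on overflow, hence the uzp + vqadd
// sequence. (Descriptive comment only.)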
1905
1906
ALWAYS_INLINE GSVector4i addus8(const GSVector4i& v) const
1907
{
1908
return GSVector4i(vreinterpretq_s32_u8(vqaddq_u8(vreinterpretq_u8_s32(v4s), vreinterpretq_u8_s32(v.v4s))));
1909
}
1910
1911
ALWAYS_INLINE GSVector4i addus16(const GSVector4i& v) const
1912
{
1913
return GSVector4i(vreinterpretq_s32_u16(vqaddq_u16(vreinterpretq_u16_s32(v4s), vreinterpretq_u16_s32(v.v4s))));
1914
}
1915
1916
ALWAYS_INLINE GSVector4i sub8(const GSVector4i& v) const
1917
{
1918
return GSVector4i(vreinterpretq_s32_s8(vsubq_s8(vreinterpretq_s8_s32(v4s), vreinterpretq_s8_s32(v.v4s))));
1919
}
1920
1921
ALWAYS_INLINE GSVector4i sub16(const GSVector4i& v) const
1922
{
1923
return GSVector4i(vreinterpretq_s32_s16(vsubq_s16(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v.v4s))));
1924
}
1925
1926
ALWAYS_INLINE GSVector4i sub32(const GSVector4i& v) const { return GSVector4i(vsubq_s32(v4s, v.v4s)); }
1927
1928
ALWAYS_INLINE GSVector4i subs8(const GSVector4i& v) const
1929
{
1930
return GSVector4i(vreinterpretq_s32_s8(vqsubq_s8(vreinterpretq_s8_s32(v4s), vreinterpretq_s8_s32(v.v4s))));
1931
}
1932
1933
ALWAYS_INLINE GSVector4i subs16(const GSVector4i& v) const
1934
{
1935
return GSVector4i(vreinterpretq_s32_s16(vqsubq_s16(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v.v4s))));
1936
}
1937
1938
ALWAYS_INLINE GSVector4i subus8(const GSVector4i& v) const
1939
{
1940
return GSVector4i(vreinterpretq_s32_u8(vqsubq_u8(vreinterpretq_u8_s32(v4s), vreinterpretq_u8_s32(v.v4s))));
1941
}
1942
1943
ALWAYS_INLINE GSVector4i subus16(const GSVector4i& v) const
1944
{
1945
return GSVector4i(vreinterpretq_s32_u16(vqsubq_u16(vreinterpretq_u16_s32(v4s), vreinterpretq_u16_s32(v.v4s))));
1946
}
1947
1948
ALWAYS_INLINE GSVector4i avg8(const GSVector4i& v) const
1949
{
1950
return GSVector4i(vreinterpretq_s32_u8(vrhaddq_u8(vreinterpretq_u8_s32(v4s), vreinterpretq_u8_s32(v.v4s))));
1951
}
1952
1953
ALWAYS_INLINE GSVector4i avg16(const GSVector4i& v) const
1954
{
1955
return GSVector4i(vreinterpretq_s32_u16(vrhaddq_u16(vreinterpretq_u16_s32(v4s), vreinterpretq_u16_s32(v.v4s))));
1956
}
1957
1958
ALWAYS_INLINE GSVector4i mul16hs(const GSVector4i& v) const
1959
{
1960
// from sse2neon
1961
int16x4_t a3210 = vget_low_s16(vreinterpretq_s16_s32(v4s));
1962
int16x4_t b3210 = vget_low_s16(vreinterpretq_s16_s32(v.v4s));
1963
int32x4_t ab3210 = vmull_s16(a3210, b3210); /* 3333222211110000 */
1964
int16x4_t a7654 = vget_high_s16(vreinterpretq_s16_s32(v4s));
1965
int16x4_t b7654 = vget_high_s16(vreinterpretq_s16_s32(v.v4s));
1966
int32x4_t ab7654 = vmull_s16(a7654, b7654); /* 7777666655554444 */
1967
uint16x8x2_t r = vuzpq_u16(vreinterpretq_u16_s32(ab3210), vreinterpretq_u16_s32(ab7654));
1968
return GSVector4i(vreinterpretq_s32_u16(r.val[1]));
1969
}
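// Note: mul16hs is the PMULHW pattern: widen each s16 pair to a 32-bit product with
// vmull, then keep only the high 16 bits of every product via vuzp, i.e. result lane
// i is (a[i] * b[i]) >> 16. (Descriptive comment only.)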
1970
1971
ALWAYS_INLINE GSVector4i mul16l(const GSVector4i& v) const
1972
{
1973
return GSVector4i(vreinterpretq_s32_s16(vmulq_s16(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v.v4s))));
1974
}
1975
1976
ALWAYS_INLINE GSVector4i mul16hrs(const GSVector4i& v) const
1977
{
1978
int32x4_t mul_lo = vmull_s16(vget_low_s16(vreinterpretq_s16_s32(v4s)), vget_low_s16(vreinterpretq_s16_s32(v.v4s)));
1979
int32x4_t mul_hi =
1980
vmull_s16(vget_high_s16(vreinterpretq_s16_s32(v4s)), vget_high_s16(vreinterpretq_s16_s32(v.v4s)));
1981
int16x4_t narrow_lo = vrshrn_n_s32(mul_lo, 15);
1982
int16x4_t narrow_hi = vrshrn_n_s32(mul_hi, 15);
1983
return GSVector4i(vreinterpretq_s32_s16(vcombine_s16(narrow_lo, narrow_hi)));
1984
}
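// Note: mul16hrs matches PMULHRSW: after the widening multiply, vrshrn_n_s32 with a
// shift of 15 performs the rounding narrow, i.e. lane i becomes
// (a[i] * b[i] + (1 << 14)) >> 15 kept as s16. (Descriptive comment only.)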
1985
1986
ALWAYS_INLINE GSVector4i mul32l(const GSVector4i& v) const { return GSVector4i(vmulq_s32(v4s, v.v4s)); }
1987
1988
ALWAYS_INLINE bool eq(const GSVector4i& v) const
1989
{
1990
const int32x4_t res = veorq_s32(v4s, v.v4s);
1991
#ifdef CPU_ARCH_ARM64
1992
return (vmaxvq_u32(vreinterpretq_u32_s32(res)) == 0);
1993
#else
1994
const int32x2_t paired = vorr_s32(vget_low_s32(res), vget_high_s32(res));
1995
return (vget_lane_u64(vreinterpret_u64_s32(paired), 0) == 0);
1996
#endif
1997
}
1998
1999
ALWAYS_INLINE GSVector4i eq8(const GSVector4i& v) const
2000
{
2001
return GSVector4i(vreinterpretq_s32_u8(vceqq_s8(vreinterpretq_s8_s32(v4s), vreinterpretq_s8_s32(v.v4s))));
2002
}
2003
2004
ALWAYS_INLINE GSVector4i eq16(const GSVector4i& v) const
2005
{
2006
return GSVector4i(vreinterpretq_s32_u16(vceqq_s16(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v.v4s))));
2007
}
2008
2009
ALWAYS_INLINE GSVector4i eq32(const GSVector4i& v) const
2010
{
2011
return GSVector4i(vreinterpretq_s32_u32(vceqq_s32(v4s, v.v4s)));
2012
}
2013
2014
#ifdef CPU_ARCH_ARM64
2015
ALWAYS_INLINE GSVector4i eq64(const GSVector4i& v) const
2016
{
2017
return GSVector4i(vreinterpretq_s32_u64(vceqq_s64(vreinterpretq_s64_s32(v4s), vreinterpretq_s64_s32(v.v4s))));
2018
}
2019
#endif
2020
2021
ALWAYS_INLINE GSVector4i neq8(const GSVector4i& v) const { return ~eq8(v); }
2022
2023
ALWAYS_INLINE GSVector4i neq16(const GSVector4i& v) const { return ~eq16(v); }
2024
2025
ALWAYS_INLINE GSVector4i neq32(const GSVector4i& v) const { return ~eq32(v); }
2026
2027
ALWAYS_INLINE GSVector4i gt8(const GSVector4i& v) const
2028
{
2029
return GSVector4i(vreinterpretq_s32_s8(vcgtq_s8(vreinterpretq_s8_s32(v4s), vreinterpretq_s8_s32(v.v4s))));
2030
}
2031
2032
ALWAYS_INLINE GSVector4i gt16(const GSVector4i& v) const
2033
{
2034
return GSVector4i(vreinterpretq_s32_s16(vcgtq_s16(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v.v4s))));
2035
}
2036
2037
ALWAYS_INLINE GSVector4i gt32(const GSVector4i& v) const { return GSVector4i(vcgtq_s32(v4s, v.v4s)); }
2038
2039
ALWAYS_INLINE GSVector4i ge8(const GSVector4i& v) const
2040
{
2041
return GSVector4i(vreinterpretq_s32_s8(vcgeq_s8(vreinterpretq_s8_s32(v4s), vreinterpretq_s8_s32(v.v4s))));
2042
}
2043
ALWAYS_INLINE GSVector4i ge16(const GSVector4i& v) const
2044
{
2045
return GSVector4i(vreinterpretq_s32_s16(vcgeq_s16(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v.v4s))));
2046
}
2047
ALWAYS_INLINE GSVector4i ge32(const GSVector4i& v) const { return GSVector4i(vcgeq_s32(v4s, v.v4s)); }
2048
2049
ALWAYS_INLINE GSVector4i lt8(const GSVector4i& v) const
2050
{
2051
return GSVector4i(vreinterpretq_s32_s8(vcltq_s8(vreinterpretq_s8_s32(v4s), vreinterpretq_s8_s32(v.v4s))));
2052
}
2053
2054
ALWAYS_INLINE GSVector4i lt16(const GSVector4i& v) const
2055
{
2056
return GSVector4i(vreinterpretq_s32_s16(vcltq_s16(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v.v4s))));
2057
}
2058
2059
ALWAYS_INLINE GSVector4i lt32(const GSVector4i& v) const { return GSVector4i(vcltq_s32(v4s, v.v4s)); }
2060
2061
ALWAYS_INLINE GSVector4i le8(const GSVector4i& v) const
2062
{
2063
return GSVector4i(vreinterpretq_s32_s8(vcleq_s8(vreinterpretq_s8_s32(v4s), vreinterpretq_s8_s32(v.v4s))));
2064
}
2065
ALWAYS_INLINE GSVector4i le16(const GSVector4i& v) const
2066
{
2067
return GSVector4i(vreinterpretq_s32_s16(vcleq_s16(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v.v4s))));
2068
}
2069
ALWAYS_INLINE GSVector4i le32(const GSVector4i& v) const { return GSVector4i(vcleq_s32(v4s, v.v4s)); }
2070
2071
ALWAYS_INLINE GSVector4i andnot(const GSVector4i& v) const { return GSVector4i(vbicq_s32(v4s, v.v4s)); }
2072
2073
ALWAYS_INLINE int mask() const
2074
{
2075
// borrowed from sse2neon
2076
const uint16x8_t high_bits = vreinterpretq_u16_u8(vshrq_n_u8(vreinterpretq_u8_s32(v4s), 7));
2077
const uint32x4_t paired16 = vreinterpretq_u32_u16(vsraq_n_u16(high_bits, high_bits, 7));
2078
const uint64x2_t paired32 = vreinterpretq_u64_u32(vsraq_n_u32(paired16, paired16, 14));
2079
const uint8x16_t paired64 = vreinterpretq_u8_u64(vsraq_n_u64(paired32, paired32, 28));
2080
return static_cast<int>(vgetq_lane_u8(paired64, 0) | ((int)vgetq_lane_u8(paired64, 8) << 8));
2081
}
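// Note: mask() reproduces PMOVMSKB: after isolating every byte's sign bit, the three
// shift-and-accumulate steps squeeze eight of those bits into each half of the
// register, and the two extracted bytes form the 16-bit result (bit n = sign of
// byte n). (Descriptive comment only.)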
2082
2083
ALWAYS_INLINE bool alltrue() const
2084
{
2085
#ifdef CPU_ARCH_ARM64
2086
return (vminvq_u32(vreinterpretq_u32_s32(v4s)) == UINT32_C(0xFFFFFFFF));
2087
#else
2088
return (vget_lane_u64(vreinterpret_u64_s32(vand_s32(vget_low_s32(v4s), vget_high_s32(v4s))), 0) ==
2089
UINT64_C(0xFFFFFFFFFFFFFFFF));
2090
#endif
2091
}
2092
2093
ALWAYS_INLINE bool allfalse() const
2094
{
2095
#ifdef CPU_ARCH_ARM64
2096
return (vmaxvq_u32(vreinterpretq_u32_s32(v4s)) == UINT32_C(0));
2097
#else
2098
return (vget_lane_u64(vreinterpret_u64_s32(vorr_s32(vget_low_s32(v4s), vget_high_s32(v4s))), 0) == UINT64_C(0));
2099
#endif
2100
}
2101
2102
template<int i>
2103
ALWAYS_INLINE GSVector4i insert8(int a) const
2104
{
2105
return GSVector4i(vreinterpretq_s32_u8(vsetq_lane_u8(a, vreinterpretq_u8_s32(v4s), static_cast<uint8_t>(i))));
2106
}
2107
2108
template<int i>
2109
ALWAYS_INLINE int extract8() const
2110
{
2111
return vgetq_lane_u8(vreinterpretq_u8_s32(v4s), i);
2112
}
2113
2114
template<int i>
2115
ALWAYS_INLINE GSVector4i insert16(int a) const
2116
{
2117
return GSVector4i(vreinterpretq_s32_u16(vsetq_lane_u16(a, vreinterpretq_u16_s32(v4s), static_cast<uint16_t>(i))));
2118
}
2119
2120
template<int i>
2121
ALWAYS_INLINE int extract16() const
2122
{
2123
return vgetq_lane_u16(vreinterpretq_u16_s32(v4s), i);
2124
}
2125
2126
template<int i>
2127
ALWAYS_INLINE GSVector4i insert32(int a) const
2128
{
2129
return GSVector4i(vsetq_lane_s32(a, v4s, i));
2130
}
2131
2132
template<int i>
2133
ALWAYS_INLINE int extract32() const
2134
{
2135
return vgetq_lane_s32(v4s, i);
2136
}
2137
2138
template<int i>
2139
ALWAYS_INLINE GSVector4i insert64(s64 a) const
2140
{
2141
return GSVector4i(vreinterpretq_s32_s64(vsetq_lane_s64(a, vreinterpretq_s64_s32(v4s), i)));
2142
}
2143
2144
template<int i>
2145
ALWAYS_INLINE s64 extract64() const
2146
{
2147
return vgetq_lane_s64(vreinterpretq_s64_s32(v4s), i);
2148
}
2149
2150
#ifdef CPU_ARCH_ARM64
2151
ALWAYS_INLINE GSVector4i tbl2(const GSVector4i& a, const GSVector4i& b, const GSVector4i& idx)
2152
{
2153
return GSVector4i(vreinterpretq_s32_u8(
2154
vqtbx2q_u8(vreinterpretq_u8_s32(v4s), uint8x16x2_t{vreinterpretq_u8_s32(a.v4s), vreinterpretq_u8_s32(b.v4s)},
2155
vreinterpretq_u8_s32(idx.v4s))));
2156
}
2157
#endif
2158
2159
ALWAYS_INLINE static GSVector4i loadnt(const void* p)
2160
{
2161
#if __has_builtin(__builtin_nontemporal_store)
2162
return GSVector4i(__builtin_nontemporal_load((int32x4_t*)p));
2163
#else
2164
return GSVector4i(vreinterpretq_s32_s64(vld1q_s64((int64_t*)p)));
2165
#endif
2166
}
2167
2168
ALWAYS_INLINE static GSVector4i load32(const void* p)
2169
{
2170
// should be ldr s0, [x0]
2171
u32 val;
2172
std::memcpy(&val, p, sizeof(u32));
2173
return GSVector4i(vreinterpretq_s32_u32(vsetq_lane_u32(val, vdupq_n_u32(0), 0)));
2174
}
2175
2176
ALWAYS_INLINE static GSVector4i zext32(s32 v) { return GSVector4i(vsetq_lane_s32(v, vdupq_n_s32(0), 0)); }
2177
2178
template<bool aligned>
2179
ALWAYS_INLINE static GSVector4i loadl(const void* p)
2180
{
2181
#ifdef CPU_ARCH_ARM32
2182
if constexpr (!aligned)
2183
return GSVector4i(vcombine_s32(vreinterpret_s32_s8(vld1_s8((int8_t*)p)), vcreate_s32(0)));
2184
#endif
2185
2186
return GSVector4i(vcombine_s32(vld1_s32((const int32_t*)p), vcreate_s32(0)));
2187
}
2188
2189
ALWAYS_INLINE static GSVector4i loadl(const GSVector2i& v) { return GSVector4i(vcombine_s32(v.v2s, vcreate_s32(0))); }
2190
2191
template<bool aligned>
2192
ALWAYS_INLINE static GSVector4i loadh(const void* p)
2193
{
2194
#ifdef CPU_ARCH_ARM32
2195
if constexpr (!aligned)
2196
return GSVector4i(vreinterpretq_s32_s8(vcombine_s8(vdup_n_s8(0), vld1_s8((int8_t*)p))));
2197
#endif
2198
2199
return GSVector4i(vreinterpretq_s32_s64(vcombine_s64(vdup_n_s64(0), vld1_s64((int64_t*)p))));
2200
}
2201
2202
ALWAYS_INLINE static GSVector4i loadh(const GSVector2i& v) { return GSVector4i(vcombine_s32(vcreate_s32(0), v.v2s)); }
2203
2204
template<bool aligned>
2205
ALWAYS_INLINE static GSVector4i load(const void* p)
2206
{
2207
#ifdef CPU_ARCH_ARM32
2208
if constexpr (!aligned)
2209
return GSVector4i(vreinterpretq_s32_s8(vld1q_s8((int8_t*)p)));
2210
#endif
2211
2212
return GSVector4i(vreinterpretq_s32_s64(vld1q_s64((int64_t*)p)));
2213
}
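// Note: the 'aligned' template parameter only changes codegen on A32, where the
// unaligned paths load/store through byte elements (no alignment expectation) instead
// of 64-bit elements (which imply at least 8-byte alignment). On A64 a plain
// vld1q/vst1q tolerates unaligned addresses, so both instantiations end up identical.
// (Descriptive comment; exact fault behaviour depends on target configuration.)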
2214
2215
ALWAYS_INLINE static void storent(void* p, const GSVector4i& v)
2216
{
2217
#if __has_builtin(__builtin_nontemporal_store)
2218
__builtin_nontemporal_store(v.v4s, static_cast<int32x4_t*>(p));
2219
#else
2220
vst1q_s64((int64_t*)p, vreinterpretq_s64_s32(v.v4s));
2221
#endif
2222
}
2223
2224
ALWAYS_INLINE static void store32(void* p, const GSVector4i& v)
2225
{
2226
u32 val = vgetq_lane_s32(v, 0);
2227
std::memcpy(p, &val, sizeof(u32));
2228
}
2229
2230
template<bool aligned>
2231
ALWAYS_INLINE static void storel(void* p, const GSVector4i& v)
2232
{
2233
#ifdef CPU_ARCH_ARM32
2234
if constexpr (!aligned)
2235
{
2236
vst1_s8((int8_t*)p, vget_low_s8(vreinterpretq_s8_s32(v.v4s)));
2237
return;
2238
}
2239
#endif
2240
2241
vst1_s64((int64_t*)p, vget_low_s64(vreinterpretq_s64_s32(v.v4s)));
2242
}
2243
2244
template<bool aligned>
2245
ALWAYS_INLINE static void storeh(void* p, const GSVector4i& v)
2246
{
2247
#ifdef CPU_ARCH_ARM32
2248
if constexpr (!aligned)
2249
{
2250
vst1_s8((int8_t*)p, vget_high_s8(vreinterpretq_s8_s32(v.v4s)));
2251
return;
2252
}
2253
#endif
2254
2255
vst1_s64((int64_t*)p, vget_high_s64(vreinterpretq_s64_s32(v.v4s)));
2256
}
2257
2258
template<bool aligned>
2259
ALWAYS_INLINE static void store(void* p, const GSVector4i& v)
2260
{
2261
#ifdef CPU_ARCH_ARM32
2262
if constexpr (!aligned)
2263
{
2264
vst1q_s8((int8_t*)p, vreinterpretq_s8_s32(v.v4s));
2265
return;
2266
}
2267
#endif
2268
2269
vst1q_s64((int64_t*)p, vreinterpretq_s64_s32(v.v4s));
2270
}
2271
2272
ALWAYS_INLINE static GSVector4i broadcast128(const GSVector4i& v) { return v; }
2273
2274
template<bool aligned>
2275
ALWAYS_INLINE static GSVector4i broadcast128(const void* v)
2276
{
2277
return load<aligned>(v);
2278
}
2279
2280
ALWAYS_INLINE void operator&=(const GSVector4i& v)
2281
{
2282
v4s = vreinterpretq_s32_s8(vandq_s8(vreinterpretq_s8_s32(v4s), vreinterpretq_s8_s32(v.v4s)));
2283
}
2284
2285
ALWAYS_INLINE void operator|=(const GSVector4i& v)
2286
{
2287
v4s = vreinterpretq_s32_s8(vorrq_s8(vreinterpretq_s8_s32(v4s), vreinterpretq_s8_s32(v.v4s)));
2288
}
2289
2290
ALWAYS_INLINE void operator^=(const GSVector4i& v)
2291
{
2292
v4s = vreinterpretq_s32_s8(veorq_s8(vreinterpretq_s8_s32(v4s), vreinterpretq_s8_s32(v.v4s)));
2293
}
2294
2295
ALWAYS_INLINE friend GSVector4i operator&(const GSVector4i& v1, const GSVector4i& v2)
2296
{
2297
return GSVector4i(vreinterpretq_s32_s8(vandq_s8(vreinterpretq_s8_s32(v1.v4s), vreinterpretq_s8_s32(v2.v4s))));
2298
}
2299
2300
ALWAYS_INLINE friend GSVector4i operator|(const GSVector4i& v1, const GSVector4i& v2)
2301
{
2302
return GSVector4i(vreinterpretq_s32_s8(vorrq_s8(vreinterpretq_s8_s32(v1.v4s), vreinterpretq_s8_s32(v2.v4s))));
2303
}
2304
2305
ALWAYS_INLINE friend GSVector4i operator^(const GSVector4i& v1, const GSVector4i& v2)
2306
{
2307
return GSVector4i(vreinterpretq_s32_s8(veorq_s8(vreinterpretq_s8_s32(v1.v4s), vreinterpretq_s8_s32(v2.v4s))));
2308
}
2309
2310
ALWAYS_INLINE friend GSVector4i operator&(const GSVector4i& v, int i) { return v & GSVector4i(i); }
2311
2312
ALWAYS_INLINE friend GSVector4i operator|(const GSVector4i& v, int i) { return v | GSVector4i(i); }
2313
2314
ALWAYS_INLINE friend GSVector4i operator^(const GSVector4i& v, int i) { return v ^ GSVector4i(i); }
2315
2316
ALWAYS_INLINE friend GSVector4i operator~(const GSVector4i& v) { return GSVector4i(vmvnq_s32(v.v4s)); }
2317
2318
ALWAYS_INLINE static GSVector4i zero() { return GSVector4i(0); }
2319
2320
ALWAYS_INLINE static GSVector4i xffffffff() { return GSVector4i(0xFFFFFFFF); }
2321
2322
ALWAYS_INLINE GSVector4i xyxy(const GSVector4i& v) const { return upl64(v); }
2323
2324
ALWAYS_INLINE static GSVector4i xyxy(const GSVector2i& xy, const GSVector2i& zw)
2325
{
2326
return GSVector4i(vcombine_s32(xy.v2s, zw.v2s));
2327
}
2328
2329
ALWAYS_INLINE static GSVector4i xyxy(const GSVector2i& xyzw) { return GSVector4i(vcombine_s32(xyzw.v2s, xyzw.v2s)); }
2330
2331
static GSVector4i rfit(const GSVector4i& fit_rect, const GSVector2i& image_size);
2332
2333
ALWAYS_INLINE GSVector2i xy() const { return GSVector2i(vget_low_s32(v4s)); }
2334
2335
ALWAYS_INLINE GSVector2i zw() const { return GSVector2i(vget_high_s32(v4s)); }
2336
2337
#define VECTOR4i_SHUFFLE_4(xs, xn, ys, yn, zs, zn, ws, wn) \
2338
ALWAYS_INLINE GSVector4i xs##ys##zs##ws() const \
2339
{ \
2340
return GSVector4i(__builtin_shufflevector(v4s, v4s, xn, yn, zn, wn)); \
2341
} \
2342
ALWAYS_INLINE GSVector4i xs##ys##zs##ws##l() const \
2343
{ \
2344
return GSVector4i(vreinterpretq_s32_s16( \
2345
__builtin_shufflevector(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v4s), xn, yn, zn, wn, 4, 5, 6, 7))); \
2346
} \
2347
ALWAYS_INLINE GSVector4i xs##ys##zs##ws##h() const \
2348
{ \
2349
return GSVector4i(vreinterpretq_s32_s16(__builtin_shufflevector( \
2350
vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v4s), 0, 1, 2, 3, 4 + xn, 4 + yn, 4 + zn, 4 + wn))); \
2351
}
2352
2353
#define VECTOR4i_SHUFFLE_3(xs, xn, ys, yn, zs, zn) \
2354
VECTOR4i_SHUFFLE_4(xs, xn, ys, yn, zs, zn, x, 0); \
2355
VECTOR4i_SHUFFLE_4(xs, xn, ys, yn, zs, zn, y, 1); \
2356
VECTOR4i_SHUFFLE_4(xs, xn, ys, yn, zs, zn, z, 2); \
2357
VECTOR4i_SHUFFLE_4(xs, xn, ys, yn, zs, zn, w, 3);
2358
2359
#define VECTOR4i_SHUFFLE_2(xs, xn, ys, yn) \
2360
VECTOR4i_SHUFFLE_3(xs, xn, ys, yn, x, 0); \
2361
VECTOR4i_SHUFFLE_3(xs, xn, ys, yn, y, 1); \
2362
VECTOR4i_SHUFFLE_3(xs, xn, ys, yn, z, 2); \
2363
VECTOR4i_SHUFFLE_3(xs, xn, ys, yn, w, 3);
2364
2365
#define VECTOR4i_SHUFFLE_1(xs, xn) \
2366
VECTOR4i_SHUFFLE_2(xs, xn, x, 0); \
2367
VECTOR4i_SHUFFLE_2(xs, xn, y, 1); \
2368
VECTOR4i_SHUFFLE_2(xs, xn, z, 2); \
2369
VECTOR4i_SHUFFLE_2(xs, xn, w, 3);
2370
2371
VECTOR4i_SHUFFLE_1(x, 0);
2372
VECTOR4i_SHUFFLE_1(y, 1);
2373
VECTOR4i_SHUFFLE_1(z, 2);
2374
VECTOR4i_SHUFFLE_1(w, 3);
2375
2376
#undef VECTOR4i_SHUFFLE_1
2377
#undef VECTOR4i_SHUFFLE_2
2378
#undef VECTOR4i_SHUFFLE_3
2379
#undef VECTOR4i_SHUFFLE_4
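// The VECTOR4i_SHUFFLE_* macros above expand to every xyzw() swizzle of the four
// 32-bit lanes (256 combinations), plus the 'l'/'h' variants that shuffle only the
// low or high four 16-bit lanes (the PSHUFLW/PSHUFHW analogues), all through
// __builtin_shufflevector so the compiler is free to pick the cheapest REV/EXT/TBL
// sequence. (Descriptive comment only.)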
2380
};
2381
2382
class alignas(16) GSVector4
2383
{
2384
struct cxpr_init_tag
2385
{
2386
};
2387
static constexpr cxpr_init_tag cxpr_init{};
2388
2389
constexpr GSVector4(cxpr_init_tag, float x, float y, float z, float w) : F32{x, y, z, w} {}
2390
2391
constexpr GSVector4(cxpr_init_tag, int x, int y, int z, int w) : I32{x, y, z, w} {}
2392
2393
constexpr GSVector4(cxpr_init_tag, u64 x, u64 y) : U64{x, y} {}
2394
2395
constexpr GSVector4(cxpr_init_tag, double x, double y) : F64{x, y} {}
2396
2397
public:
2398
union
2399
{
2400
struct
2401
{
2402
float x, y, z, w;
2403
};
2404
struct
2405
{
2406
float r, g, b, a;
2407
};
2408
struct
2409
{
2410
float left, top, right, bottom;
2411
};
2412
float F32[4];
2413
double F64[2];
2414
s8 I8[16];
2415
s16 I16[8];
2416
s32 I32[4];
2417
s64 I64[2];
2418
u8 U8[16];
2419
u16 U16[8];
2420
u32 U32[4];
2421
u64 U64[2];
2422
float32x4_t v4s;
2423
};
2424
2425
GSVector4() = default;
2426
2427
constexpr static GSVector4 cxpr(float x, float y, float z, float w) { return GSVector4(cxpr_init, x, y, z, w); }
2428
constexpr static GSVector4 cxpr(float x) { return GSVector4(cxpr_init, x, x, x, x); }
2429
2430
constexpr static GSVector4 cxpr(int x, int y, int z, int w) { return GSVector4(cxpr_init, x, y, z, w); }
2431
constexpr static GSVector4 cxpr(int x) { return GSVector4(cxpr_init, x, x, x, x); }
2432
2433
constexpr static GSVector4 cxpr64(u64 x, u64 y) { return GSVector4(cxpr_init, x, y); }
2434
constexpr static GSVector4 cxpr64(u64 x) { return GSVector4(cxpr_init, x, x); }
2435
2436
constexpr static GSVector4 cxpr64(double x, double y) { return GSVector4(cxpr_init, x, y); }
2437
constexpr static GSVector4 cxpr64(double x) { return GSVector4(cxpr_init, x, x); }
2438
2439
constexpr static GSVector4 cxpr_rgba32(u32 rgba)
2440
{
2441
return GSVector4(cxpr_init, static_cast<float>(rgba & 0xff), static_cast<float>((rgba >> 8) & 0xff),
2442
static_cast<float>((rgba >> 16) & 0xff), static_cast<float>((rgba >> 24) & 0xff));
2443
}
2444
2445
constexpr static GSVector4 cxpr_unorm8(u32 rgba)
2446
{
2447
return GSVector4(cxpr_init, static_cast<float>(rgba & 0xff) / 255.0f,
2448
static_cast<float>((rgba >> 8) & 0xff) / 255.0f, static_cast<float>((rgba >> 16) & 0xff) / 255.0f,
2449
static_cast<float>((rgba >> 24) & 0xff) / 255.0f);
2450
}
2451
2452
ALWAYS_INLINE GSVector4(float x, float y, float z, float w)
2453
{
2454
const float arr[4] = {x, y, z, w};
2455
v4s = vld1q_f32(arr);
2456
}
2457
2458
ALWAYS_INLINE GSVector4(float x, float y) { v4s = vsetq_lane_f32(x, vsetq_lane_f32(y, vdupq_n_f32(0.0f), 1), 0); }
2459
2460
ALWAYS_INLINE GSVector4(int x, int y, int z, int w)
2461
{
2462
const int arr[4] = {x, y, z, w};
2463
v4s = vcvtq_f32_s32(vld1q_s32(arr));
2464
}
2465
2466
ALWAYS_INLINE GSVector4(int x, int y)
2467
{
2468
v4s = vcvtq_f32_s32(vsetq_lane_s32(x, vsetq_lane_s32(y, vdupq_n_s32(0), 1), 0));
2469
}
2470
2471
ALWAYS_INLINE explicit GSVector4(const GSVector2& v) { v4s = vcombine_f32(v.v2s, vcreate_f32(0)); }
2472
2473
ALWAYS_INLINE explicit GSVector4(const GSVector2i& v) { v4s = vcombine_f32(vcvt_f32_s32(v.v2s), vcreate_f32(0)); }
2474
2475
ALWAYS_INLINE constexpr explicit GSVector4(float32x4_t m) : v4s(m) {}
2476
2477
ALWAYS_INLINE explicit GSVector4(float f) { v4s = vdupq_n_f32(f); }
2478
2479
ALWAYS_INLINE explicit GSVector4(int i) { v4s = vcvtq_f32_s32(vdupq_n_s32(i)); }
2480
2481
ALWAYS_INLINE explicit GSVector4(const GSVector4i& v);
2482
2483
ALWAYS_INLINE static GSVector4 cast(const GSVector4i& v);
2484
2485
ALWAYS_INLINE static GSVector4 f64(double x, double y)
2486
{
2487
#ifdef CPU_ARCH_ARM64
2488
return GSVector4(vreinterpretq_f32_f64(vsetq_lane_f64(y, vdupq_n_f64(x), 1)));
2489
#else
2490
GSVector4 ret;
2491
ret.F64[0] = x;
2492
ret.F64[1] = y;
2493
return ret;
2494
#endif
2495
}
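// Note: for the *64 operations further down, GSVector4 doubles as a pair of f64 lanes
// stored in the same 128-bit register; f64(x, y) builds that pair and bit-casts it
// into the float32x4_t member, so no value conversion takes place.
// (Descriptive comment only.)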
2496
2497
ALWAYS_INLINE static GSVector4 f64(double x)
2498
{
2499
#ifdef CPU_ARCH_ARM64
2500
return GSVector4(vreinterpretq_f32_f64(vdupq_n_f64(x)));
2501
#else
2502
GSVector4 ret;
2503
ret.F64[0] = ret.F64[1] = x;
2504
return ret;
2505
#endif
2506
}
2507
2508
ALWAYS_INLINE void operator=(float f) { v4s = vdupq_n_f32(f); }
2509
2510
ALWAYS_INLINE void operator=(float32x4_t m) { v4s = m; }
2511
2512
ALWAYS_INLINE operator float32x4_t() const { return v4s; }
2513
2514
ALWAYS_INLINE u32 rgba32() const { return GSVector4i(*this).rgba32(); }
2515
2516
ALWAYS_INLINE static GSVector4 rgba32(u32 rgba)
2517
{
2518
return GSVector4(GSVector4i::zext32(static_cast<s32>(rgba)).u8to32());
2519
}
2520
2521
ALWAYS_INLINE static GSVector4 unorm8(u32 rgba) { return rgba32(rgba) * GSVector4::cxpr(1.0f / 255.0f); }
2522
2523
ALWAYS_INLINE GSVector4 abs() const { return GSVector4(vabsq_f32(v4s)); }
2524
2525
ALWAYS_INLINE GSVector4 neg() const { return GSVector4(vnegq_f32(v4s)); }
2526
2527
#ifdef CPU_ARCH_ARM64
2528
2529
ALWAYS_INLINE GSVector4 floor() const { return GSVector4(vrndmq_f32(v4s)); }
2530
2531
ALWAYS_INLINE GSVector4 ceil() const { return GSVector4(vrndpq_f32(v4s)); }
2532
2533
#else
2534
2535
ALWAYS_INLINE GSVector4 floor() const
2536
{
2537
return GSVector4(std::floor(vgetq_lane_f32(v4s, 0)), std::floor(vgetq_lane_f32(v4s, 1)),
2538
std::floor(vgetq_lane_f32(v4s, 2)), std::floor(vgetq_lane_f32(v4s, 3)));
2539
}
2540
2541
ALWAYS_INLINE GSVector4 ceil() const
2542
{
2543
return GSVector4(std::ceil(vgetq_lane_f32(v4s, 0)), std::ceil(vgetq_lane_f32(v4s, 1)),
2544
std::ceil(vgetq_lane_f32(v4s, 2)), std::ceil(vgetq_lane_f32(v4s, 3)));
2545
}
2546
2547
#endif
2548
2549
#ifdef CPU_ARCH_ARM64
2550
2551
ALWAYS_INLINE GSVector4 hadd() const { return GSVector4(vpaddq_f32(v4s, v4s)); }
2552
2553
ALWAYS_INLINE GSVector4 hadd(const GSVector4& v) const { return GSVector4(vpaddq_f32(v4s, v.v4s)); }
2554
2555
ALWAYS_INLINE GSVector4 hsub() const { return GSVector4(vsubq_f32(vuzp1q_f32(v4s, v4s), vuzp2q_f32(v4s, v4s))); }
2556
2557
ALWAYS_INLINE GSVector4 hsub(const GSVector4& v) const
2558
{
2559
return GSVector4(vsubq_f32(vuzp1q_f32(v4s, v.v4s), vuzp2q_f32(v4s, v.v4s)));
2560
}
2561
2562
#else
2563
2564
ALWAYS_INLINE GSVector4 hadd() const
2565
{
2566
const float32x2_t res = vpadd_f32(vget_low_f32(v4s), vget_high_f32(v4s));
2567
return GSVector4(vcombine_f32(res, res));
2568
}
2569
2570
ALWAYS_INLINE GSVector4 hadd(const GSVector4& v) const
2571
{
2572
const float32x2_t res1 = vpadd_f32(vget_low_f32(v4s), vget_high_f32(v4s));
2573
const float32x2_t res2 = vpadd_f32(vget_low_f32(v.v4s), vget_high_f32(v.v4s));
2574
return GSVector4(vcombine_f32(res1, res2));
2575
}
2576
2577
ALWAYS_INLINE GSVector4 hsub() const
2578
{
2579
const float32x4x2_t res = vuzpq_f32(v4s, v4s);
2580
return GSVector4(vsubq_f32(res.val[0], res.val[1]));
2581
}
2582
2583
ALWAYS_INLINE GSVector4 hsub(const GSVector4& v) const
2584
{
2585
const float32x4x2_t res = vuzpq_f32(v4s, v.v4s);
2586
return GSVector4(vsubq_f32(res.val[0], res.val[1]));
2587
}
2588
2589
#endif
2590
2591
ALWAYS_INLINE float dot(const GSVector4& v) const
2592
{
2593
#ifdef CPU_ARCH_ARM64
2594
return vaddvq_f32(vmulq_f32(v4s, v.v4s));
2595
#else
2596
const float32x4_t dp = vmulq_f32(v4s, v.v4s);
2597
float32x2_t tmp = vadd_f32(vget_low_f32(dp), vget_high_f32(dp)); // (x+z, y+w)
2598
return vget_lane_f32(vadd_f32(tmp, vdup_lane_f32(tmp, 1)), 0);
2599
#endif
2600
}
2601
2602
ALWAYS_INLINE float addv() const
2603
{
2604
#ifdef CPU_ARCH_ARM64
2605
return vaddvq_f32(v4s);
2606
#else
2607
float32x2_t tmp = vadd_f32(vget_low_f32(v4s), vget_high_f32(v4s)); // (x+z, y+w)
2608
return vget_lane_f32(vadd_f32(tmp, vdup_lane_f32(tmp, 1)), 0);
2609
#endif
2610
}
2611
2612
ALWAYS_INLINE float minv() const
2613
{
2614
#ifdef CPU_ARCH_ARM64
2615
return vminvq_f32(v4s);
2616
#else
2617
float32x2_t tmp = vmin_f32(vget_low_f32(v4s), vget_high_f32(v4s)); // (min(x,z), min(y,w))
2618
return vget_lane_f32(vmin_f32(tmp, vdup_lane_f32(tmp, 1)), 0);
2619
#endif
2620
}
2621
2622
ALWAYS_INLINE float maxv() const
2623
{
2624
#ifdef CPU_ARCH_ARM64
2625
return vmaxvq_f32(v4s);
2626
#else
2627
float32x2_t tmp = vmax_f32(vget_low_f32(v4s), vget_high_f32(v4s)); // (max(x,z), max(y,w))
2628
return vget_lane_f32(vmax_f32(tmp, vdup_lane_f32(tmp, 1)), 0);
2629
#endif
2630
}
2631
2632
ALWAYS_INLINE GSVector4 sat(const GSVector4& a, const GSVector4& b) const { return max(a).min(b); }
2633
2634
ALWAYS_INLINE GSVector4 sat(const GSVector4& a) const
2635
{
2636
#ifdef CPU_ARCH_ARM64
2637
const GSVector4 minv(vreinterpretq_f32_f64(vdupq_laneq_f64(vreinterpretq_f64_f32(a.v4s), 0)));
2638
const GSVector4 maxv(vreinterpretq_f32_f64(vdupq_laneq_f64(vreinterpretq_f64_f32(a.v4s), 1)));
2639
#else
2640
const GSVector4 minv(a.xyxy());
2641
const GSVector4 maxv(a.zwzw());
2642
#endif
2643
return sat(minv, maxv);
2644
}
2645
2646
ALWAYS_INLINE GSVector4 sat(const float scale = 255) const { return sat(zero(), GSVector4(scale)); }
2647
2648
ALWAYS_INLINE GSVector4 clamp(const float scale = 255) const { return min(GSVector4(scale)); }
2649
2650
ALWAYS_INLINE GSVector4 min(const GSVector4& a) const { return GSVector4(vminq_f32(v4s, a.v4s)); }
2651
2652
ALWAYS_INLINE GSVector4 max(const GSVector4& a) const { return GSVector4(vmaxq_f32(v4s, a.v4s)); }
2653
2654
template<int mask>
2655
ALWAYS_INLINE GSVector4 blend32(const GSVector4& a) const
2656
{
2657
return GSVector4(__builtin_shufflevector(v4s, a.v4s, (mask & 1) ? 4 : 0, (mask & 2) ? 5 : 1, (mask & 4) ? 6 : 2,
2658
(mask & 8) ? 7 : 3));
2659
}
2660
2661
ALWAYS_INLINE GSVector4 blend32(const GSVector4& a, const GSVector4& mask) const
2662
{
2663
// duplicate sign bit across and bit select
2664
const uint32x4_t bitmask = vreinterpretq_u32_s32(vshrq_n_s32(vreinterpretq_s32_f32(mask.v4s), 31));
2665
return GSVector4(vbslq_f32(bitmask, a.v4s, v4s));
2666
}
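// Note: the mask form of blend32 follows BLENDVPS semantics: only the sign bit of
// each mask lane matters. vshrq_n_s32(mask, 31) smears that bit across the lane and
// vbslq_f32 then picks a where the mask is set and *this where it is clear.
// (Descriptive comment only.)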
2667
2668
#ifdef CPU_ARCH_ARM64
2669
2670
ALWAYS_INLINE GSVector4 upl(const GSVector4& a) const { return GSVector4(vzip1q_f32(v4s, a.v4s)); }
2671
2672
ALWAYS_INLINE GSVector4 uph(const GSVector4& a) const { return GSVector4(vzip2q_f32(v4s, a.v4s)); }
2673
2674
ALWAYS_INLINE GSVector4 upld(const GSVector4& a) const
2675
{
2676
return GSVector4(vreinterpretq_f32_f64(vzip1q_f64(vreinterpretq_f64_f32(v4s), vreinterpretq_f64_f32(a.v4s))));
2677
}
2678
2679
ALWAYS_INLINE GSVector4 uphd(const GSVector4& a) const
2680
{
2681
return GSVector4(vreinterpretq_f32_f64(vzip2q_f64(vreinterpretq_f64_f32(v4s), vreinterpretq_f64_f32(a.v4s))));
2682
}
2683
2684
#else
2685
2686
ALWAYS_INLINE GSVector4 upl(const GSVector4& a) const
2687
{
2688
const float32x2x2_t res = vzip_f32(vget_low_f32(v4s), vget_low_f32(a.v4s));
2689
return GSVector4(vcombine_f32(res.val[0], res.val[1]));
2690
}
2691
2692
ALWAYS_INLINE GSVector4 uph(const GSVector4& a) const
2693
{
2694
const float32x2x2_t res = vzip_f32(vget_high_f32(v4s), vget_high_f32(a.v4s));
2695
return GSVector4(vcombine_f32(res.val[0], res.val[1]));
2696
}
2697
2698
ALWAYS_INLINE GSVector4 upld(const GSVector4& a) const
2699
{
2700
return GSVector4(vreinterpretq_f32_s64(
2701
vcombine_s64(vget_low_s64(vreinterpretq_s64_f32(v4s)), vget_low_s64(vreinterpretq_s64_f32(a.v4s)))));
2702
}
2703
2704
ALWAYS_INLINE GSVector4 uphd(const GSVector4& a) const
2705
{
2706
return GSVector4(vreinterpretq_f32_s64(
2707
vcombine_s64(vget_high_s64(vreinterpretq_s64_f32(v4s)), vget_high_s64(vreinterpretq_s64_f32(a.v4s)))));
2708
}
2709
2710
#endif
2711
2712
ALWAYS_INLINE GSVector4 l2h(const GSVector4& a) const
2713
{
2714
return GSVector4(vcombine_f32(vget_low_f32(v4s), vget_low_f32(a.v4s)));
2715
}
2716
2717
ALWAYS_INLINE GSVector4 h2l(const GSVector4& a) const
2718
{
2719
return GSVector4(vcombine_f32(vget_high_f32(v4s), vget_high_f32(a.v4s)));
2720
}
2721
2722
ALWAYS_INLINE GSVector4 andnot(const GSVector4& v) const
2723
{
2724
return GSVector4(vreinterpretq_f32_s32(vbicq_s32(vreinterpretq_s32_f32(v4s), vreinterpretq_s32_f32(v.v4s))));
2725
}
2726
2727
ALWAYS_INLINE int mask() const
2728
{
2729
#ifdef CPU_ARCH_ARM64
2730
static constexpr const int32_t shifts[] = {0, 1, 2, 3};
2731
return static_cast<int>(vaddvq_u32(vshlq_u32(vshrq_n_u32(vreinterpretq_u32_f32(v4s), 31), vld1q_s32(shifts))));
2732
#else
2733
// sse2neon again
2734
uint64x2_t high_bits = vreinterpretq_u64_u32(vshrq_n_u32(vreinterpretq_u32_f32(v4s), 31));
2735
uint8x16_t paired = vreinterpretq_u8_u64(vsraq_n_u64(high_bits, high_bits, 31));
2736
return vgetq_lane_u8(paired, 0) | (vgetq_lane_u8(paired, 8) << 2);
2737
#endif
2738
}
2739
2740
ALWAYS_INLINE bool alltrue() const
2741
{
2742
#ifdef CPU_ARCH_ARM64
2743
return (vminvq_u32(vreinterpretq_u32_f32(v4s)) == UINT32_C(0xFFFFFFFF));
2744
#else
2745
2746
return (vget_lane_u64(vreinterpret_u64_u32(vand_u32(vget_low_u32(vreinterpretq_u32_f32(v4s)),
2747
vget_high_u32(vreinterpretq_u32_f32(v4s)))),
2748
0) == UINT64_C(0xFFFFFFFFFFFFFFFF));
2749
#endif
2750
}
2751
2752
ALWAYS_INLINE bool allfalse() const
2753
{
2754
#ifdef CPU_ARCH_ARM64
2755
return (vmaxvq_u32(vreinterpretq_u32_f32(v4s)) == UINT32_C(0));
2756
#else
2757
return (vget_lane_u64(vreinterpret_u64_u32(vorr_u32(vget_low_u32(vreinterpretq_u32_f32(v4s)),
2758
vget_high_u32(vreinterpretq_u32_f32(v4s)))),
2759
0) == UINT64_C(0));
2760
#endif
2761
}
2762
2763
ALWAYS_INLINE GSVector4 replace_nan(const GSVector4& v) const { return v.blend32(*this, *this == *this); }
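// Note: replace_nan relies on NaN != NaN: (*this == *this) is all-ones for ordinary
// lanes and all-zeros for NaN lanes, so the blend keeps this vector's lanes and
// substitutes v's lane wherever *this holds a NaN. (Descriptive comment only.)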
2764
2765
template<int src, int dst>
2766
ALWAYS_INLINE GSVector4 insert32(const GSVector4& v) const
2767
{
2768
#ifdef CPU_ARCH_ARM64
2769
return GSVector4(vcopyq_laneq_f32(v4s, dst, v.v4s, src));
2770
#else
2771
return GSVector4(vsetq_lane_f32(vgetq_lane_f32(v.v4s, src), v4s, dst));
2772
#endif
2773
}
2774
2775
template<int i>
2776
ALWAYS_INLINE GSVector4 insert32(float v) const
2777
{
2778
return GSVector4(vsetq_lane_f32(v, v4s, i));
2779
}
2780
2781
template<int i>
2782
ALWAYS_INLINE float extract32() const
2783
{
2784
return vgetq_lane_f32(v4s, i);
2785
}
2786
2787
template<int dst>
2788
ALWAYS_INLINE GSVector4 insert64(double v) const
2789
{
2790
#ifdef CPU_ARCH_ARM64
2791
return GSVector4(vreinterpretq_f32_f64(vsetq_lane_f64(v, vreinterpretq_f64_f32(v4s), dst)));
2792
#else
2793
GSVector4 ret(*this);
2794
ret.F64[dst] = v;
2795
return ret;
2796
#endif
2797
}
2798
2799
template<int src>
2800
ALWAYS_INLINE double extract64() const
2801
{
2802
#ifdef CPU_ARCH_ARM64
2803
return vgetq_lane_f64(vreinterpretq_f64_f32(v4s), src);
2804
#else
2805
return F64[src];
2806
#endif
2807
}
2808
2809
ALWAYS_INLINE static GSVector4 zero() { return GSVector4(vdupq_n_f32(0.0f)); }
2810
2811
ALWAYS_INLINE static GSVector4 xffffffff() { return GSVector4(vreinterpretq_f32_u32(vdupq_n_u32(0xFFFFFFFFu))); }
2812
2813
template<bool aligned>
2814
ALWAYS_INLINE static GSVector4 loadl(const void* p)
2815
{
2816
#ifdef CPU_ARCH_ARM32
2817
if constexpr (!aligned)
2818
return GSVector4(vcombine_f32(vreinterpret_f32_s8(vld1_s8((int8_t*)p)), vcreate_f32(0)));
2819
#endif
2820
2821
return GSVector4(vcombine_f32(vld1_f32((const float*)p), vcreate_f32(0)));
2822
}
2823
2824
ALWAYS_INLINE static GSVector4 load(float f) { return GSVector4(vsetq_lane_f32(f, vmovq_n_f32(0.0f), 0)); }
2825
2826
template<bool aligned>
2827
ALWAYS_INLINE static GSVector4 load(const void* p)
2828
{
2829
#ifdef CPU_ARCH_ARM32
2830
if constexpr (!aligned)
2831
return GSVector4(vreinterpretq_f32_s8(vld1q_s8((int8_t*)p)));
2832
#endif
2833
2834
return GSVector4(vld1q_f32((const float*)p));
2835
}
2836
2837
ALWAYS_INLINE static void storent(void* p, const GSVector4& v) { vst1q_f32((float*)p, v.v4s); }
2838
2839
template<bool aligned>
2840
ALWAYS_INLINE static void storel(void* p, const GSVector4& v)
2841
{
2842
#ifdef CPU_ARCH_ARM32
2843
if constexpr (!aligned)
2844
{
2845
vst1_s8((int8_t*)p, vreinterpret_s8_f32(vget_low_f32(v.v4s)));
2846
return;
2847
}
2848
#endif
2849
2850
vst1_f32((float*)p, vget_low_f32(v.v4s));
2851
}
2852
2853
template<bool aligned>
2854
ALWAYS_INLINE static void storeh(void* p, const GSVector4& v)
2855
{
2856
#ifdef CPU_ARCH_ARM32
2857
if constexpr (!aligned)
2858
{
2859
vst1_s8((int8_t*)p, vreinterpret_s8_f32(vget_high_f32(v.v4s)));
2860
return;
2861
}
2862
#endif
2863
2864
vst1_f32((float*)p, vget_high_f32(v.v4s));
2865
}
2866
2867
template<bool aligned>
2868
ALWAYS_INLINE static void store(void* p, const GSVector4& v)
2869
{
2870
#ifdef CPU_ARCH_ARM32
2871
if constexpr (!aligned)
2872
{
2873
vst1q_s8((int8_t*)p, vreinterpretq_s8_f32(v.v4s));
2874
return;
2875
}
2876
#endif
2877
2878
vst1q_f32((float*)p, v.v4s);
2879
}
2880
2881
ALWAYS_INLINE static void store(float* p, const GSVector4& v) { vst1q_lane_f32(p, v.v4s, 0); }
2882
2883
ALWAYS_INLINE GSVector4 operator-() const { return neg(); }
2884
2885
ALWAYS_INLINE void operator+=(const GSVector4& v) { v4s = vaddq_f32(v4s, v.v4s); }
2886
ALWAYS_INLINE void operator-=(const GSVector4& v) { v4s = vsubq_f32(v4s, v.v4s); }
2887
ALWAYS_INLINE void operator*=(const GSVector4& v) { v4s = vmulq_f32(v4s, v.v4s); }
2888
ALWAYS_INLINE void operator/=(const GSVector4& v)
2889
{
2890
#ifdef CPU_ARCH_ARM64
2891
v4s = vdivq_f32(v4s, v.v4s);
2892
#else
2893
*this =
2894
GSVector4(vgetq_lane_f32(v4s, 0) / vgetq_lane_f32(v.v4s, 0), vgetq_lane_f32(v4s, 1) / vgetq_lane_f32(v.v4s, 1),
2895
vgetq_lane_f32(v4s, 2) / vgetq_lane_f32(v.v4s, 2), vgetq_lane_f32(v4s, 3) / vgetq_lane_f32(v.v4s, 3));
2896
#endif
2897
}
2898
2899
ALWAYS_INLINE void operator+=(float f) { *this += GSVector4(f); }
2900
ALWAYS_INLINE void operator-=(float f) { *this -= GSVector4(f); }
2901
ALWAYS_INLINE void operator*=(float f) { *this *= GSVector4(f); }
2902
ALWAYS_INLINE void operator/=(float f)
2903
{
2904
#ifdef CPU_ARCH_ARM64
2905
*this /= GSVector4(f);
2906
#else
2907
*this = GSVector4(vgetq_lane_f32(v4s, 0) / f, vgetq_lane_f32(v4s, 1) / f, vgetq_lane_f32(v4s, 2) / f,
2908
vgetq_lane_f32(v4s, 3) / f);
2909
#endif
2910
}
2911
2912
ALWAYS_INLINE void operator&=(const GSVector4& v)
2913
{
2914
v4s = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(v4s), vreinterpretq_u32_f32(v.v4s)));
2915
}
2916
2917
ALWAYS_INLINE void operator|=(const GSVector4& v)
2918
{
2919
v4s = vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(v4s), vreinterpretq_u32_f32(v.v4s)));
2920
}
2921
2922
ALWAYS_INLINE void operator^=(const GSVector4& v)
2923
{
2924
v4s = vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(v4s), vreinterpretq_u32_f32(v.v4s)));
2925
}
2926
2927
ALWAYS_INLINE friend GSVector4 operator+(const GSVector4& v1, const GSVector4& v2)
2928
{
2929
return GSVector4(vaddq_f32(v1.v4s, v2.v4s));
2930
}
2931
2932
ALWAYS_INLINE friend GSVector4 operator-(const GSVector4& v1, const GSVector4& v2)
2933
{
2934
return GSVector4(vsubq_f32(v1.v4s, v2.v4s));
2935
}
2936
2937
ALWAYS_INLINE friend GSVector4 operator*(const GSVector4& v1, const GSVector4& v2)
2938
{
2939
return GSVector4(vmulq_f32(v1.v4s, v2.v4s));
2940
}
2941
2942
ALWAYS_INLINE friend GSVector4 operator/(const GSVector4& v1, const GSVector4& v2)
2943
{
2944
#ifdef CPU_ARCH_ARM64
2945
return GSVector4(vdivq_f32(v1.v4s, v2.v4s));
2946
#else
2947
return GSVector4(
2948
vgetq_lane_f32(v1.v4s, 0) / vgetq_lane_f32(v2.v4s, 0), vgetq_lane_f32(v1.v4s, 1) / vgetq_lane_f32(v2.v4s, 1),
2949
vgetq_lane_f32(v1.v4s, 2) / vgetq_lane_f32(v2.v4s, 2), vgetq_lane_f32(v1.v4s, 3) / vgetq_lane_f32(v2.v4s, 3));
2950
#endif
2951
}
2952
2953
ALWAYS_INLINE friend GSVector4 operator+(const GSVector4& v, float f) { return v + GSVector4(f); }
2954
ALWAYS_INLINE friend GSVector4 operator-(const GSVector4& v, float f) { return v - GSVector4(f); }
2955
ALWAYS_INLINE friend GSVector4 operator*(const GSVector4& v, float f) { return v * GSVector4(f); }
2956
ALWAYS_INLINE friend GSVector4 operator/(const GSVector4& v, float f)
2957
{
2958
#ifdef CPU_ARCH_ARM64
2959
return v / GSVector4(f);
2960
#else
2961
return GSVector4(vgetq_lane_f32(v.v4s, 0) / f, vgetq_lane_f32(v.v4s, 1) / f, vgetq_lane_f32(v.v4s, 2) / f,
2962
vgetq_lane_f32(v.v4s, 3) / f);
2963
#endif
2964
}
2965
2966
ALWAYS_INLINE friend GSVector4 operator&(const GSVector4& v1, const GSVector4& v2)
2967
{
2968
return GSVector4(vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(v1.v4s), vreinterpretq_u32_f32(v2.v4s))));
2969
}
2970
2971
ALWAYS_INLINE friend GSVector4 operator|(const GSVector4& v1, const GSVector4& v2)
2972
{
2973
return GSVector4(vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(v1.v4s), vreinterpretq_u32_f32(v2.v4s))));
2974
}
2975
2976
ALWAYS_INLINE friend GSVector4 operator^(const GSVector4& v1, const GSVector4& v2)
2977
{
2978
return GSVector4(vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(v1.v4s), vreinterpretq_u32_f32(v2.v4s))));
2979
}
2980
2981
ALWAYS_INLINE friend GSVector4 operator==(const GSVector4& v1, const GSVector4& v2)
2982
{
2983
return GSVector4(vreinterpretq_f32_u32(vceqq_f32(v1.v4s, v2.v4s)));
2984
}
2985
2986
ALWAYS_INLINE friend GSVector4 operator!=(const GSVector4& v1, const GSVector4& v2)
2987
{
2988
// NEON has no !=
2989
return GSVector4(vreinterpretq_f32_u32(vmvnq_u32(vceqq_f32(v1.v4s, v2.v4s))));
2990
}
2991
2992
ALWAYS_INLINE friend GSVector4 operator>(const GSVector4& v1, const GSVector4& v2)
2993
{
2994
return GSVector4(vreinterpretq_f32_u32(vcgtq_f32(v1.v4s, v2.v4s)));
2995
}
2996
2997
ALWAYS_INLINE friend GSVector4 operator<(const GSVector4& v1, const GSVector4& v2)
2998
{
2999
return GSVector4(vreinterpretq_f32_u32(vcltq_f32(v1.v4s, v2.v4s)));
3000
}
3001
3002
ALWAYS_INLINE friend GSVector4 operator>=(const GSVector4& v1, const GSVector4& v2)
3003
{
3004
return GSVector4(vreinterpretq_f32_u32(vcgeq_f32(v1.v4s, v2.v4s)));
3005
}
3006
3007
ALWAYS_INLINE friend GSVector4 operator<=(const GSVector4& v1, const GSVector4& v2)
3008
{
3009
return GSVector4(vreinterpretq_f32_u32(vcleq_f32(v1.v4s, v2.v4s)));
3010
}
3011
3012
ALWAYS_INLINE GSVector4 mul64(const GSVector4& v) const
3013
{
3014
#ifdef CPU_ARCH_ARM64
3015
return GSVector4(vreinterpretq_f32_f64(vmulq_f64(vreinterpretq_f64_f32(v4s), vreinterpretq_f64_f32(v.v4s))));
3016
#else
3017
return GSVector4::f64(F64[0] * v.F64[0], F64[1] * v.F64[1]);
3018
#endif
3019
}
3020
3021
ALWAYS_INLINE GSVector4 add64(const GSVector4& v) const
3022
{
3023
#ifdef CPU_ARCH_ARM64
3024
return GSVector4(vreinterpretq_f32_f64(vaddq_f64(vreinterpretq_f64_f32(v4s), vreinterpretq_f64_f32(v.v4s))));
3025
#else
3026
return GSVector4::f64(F64[0] + v.F64[0], F64[1] + v.F64[1]);
3027
#endif
3028
}
3029
3030
ALWAYS_INLINE GSVector4 sub64(const GSVector4& v) const
3031
{
3032
#ifdef CPU_ARCH_ARM64
3033
return GSVector4(vreinterpretq_f32_f64(vsubq_f64(vreinterpretq_f64_f32(v4s), vreinterpretq_f64_f32(v.v4s))));
3034
#else
3035
return GSVector4::f64(F64[0] - v.F64[0], F64[1] - v.F64[1]);
3036
#endif
3037
}
3038
3039
ALWAYS_INLINE GSVector4 div64(const GSVector4& v) const
3040
{
3041
#ifdef CPU_ARCH_ARM64
3042
return GSVector4(vreinterpretq_f32_f64(vdivq_f64(vreinterpretq_f64_f32(v4s), vreinterpretq_f64_f32(v.v4s))));
3043
#else
3044
return GSVector4::f64(F64[0] / v.F64[0], F64[1] / v.F64[1]);
3045
#endif
3046
}
3047
3048
ALWAYS_INLINE GSVector4 gt64(const GSVector4& v) const
3049
{
3050
#ifdef CPU_ARCH_ARM64
3051
return GSVector4(vreinterpretq_f32_f64(vcgtq_f64(vreinterpretq_f64_f32(v4s), vreinterpretq_f64_f32(v.v4s))));
3052
#else
3053
GSVector4 ret;
3054
ret.U64[0] = (F64[0] > v.F64[0]) ? 0xFFFFFFFFFFFFFFFFULL : 0;
3055
ret.U64[1] = (F64[1] > v.F64[1]) ? 0xFFFFFFFFFFFFFFFFULL : 0;
3056
return ret;
3057
#endif
3058
}
3059
3060
ALWAYS_INLINE GSVector4 eq64(const GSVector4& v) const
3061
{
3062
#ifdef CPU_ARCH_ARM64
3063
return GSVector4(vreinterpretq_f32_f64(vceqq_f64(vreinterpretq_f64_f32(v4s), vreinterpretq_f64_f32(v.v4s))));
3064
#else
3065
GSVector4 ret;
3066
ret.U64[0] = (F64[0] == v.F64[0]) ? 0xFFFFFFFFFFFFFFFFULL : 0;
3067
ret.U64[1] = (F64[1] == v.F64[1]) ? 0xFFFFFFFFFFFFFFFFULL : 0;
3068
return ret;
3069
#endif
3070
}
3071
3072
ALWAYS_INLINE GSVector4 lt64(const GSVector4& v) const
3073
{
3074
#ifdef CPU_ARCH_ARM64
3075
return GSVector4(vreinterpretq_f32_f64(vcltq_f64(vreinterpretq_f64_f32(v4s), vreinterpretq_f64_f32(v.v4s))));
3076
#else
3077
GSVector4 ret;
3078
ret.U64[0] = (F64[0] < v.F64[0]) ? 0xFFFFFFFFFFFFFFFFULL : 0;
3079
ret.U64[1] = (F64[1] < v.F64[1]) ? 0xFFFFFFFFFFFFFFFFULL : 0;
3080
return ret;
3081
#endif
3082
}
3083
3084
ALWAYS_INLINE GSVector4 ge64(const GSVector4& v) const
3085
{
3086
#ifdef CPU_ARCH_ARM64
3087
return GSVector4(vreinterpretq_f32_f64(vcgeq_f64(vreinterpretq_f64_f32(v4s), vreinterpretq_f64_f32(v.v4s))));
3088
#else
3089
GSVector4 ret;
3090
ret.U64[0] = (F64[0] >= v.F64[0]) ? 0xFFFFFFFFFFFFFFFFULL : 0;
3091
ret.U64[1] = (F64[1] >= v.F64[1]) ? 0xFFFFFFFFFFFFFFFFULL : 0;
3092
return ret;
3093
#endif
3094
}
3095
3096
ALWAYS_INLINE GSVector4 le64(const GSVector4& v) const
3097
{
3098
#ifdef CPU_ARCH_ARM64
3099
return GSVector4(vreinterpretq_f32_f64(vcleq_f64(vreinterpretq_f64_f32(v4s), vreinterpretq_f64_f32(v.v4s))));
3100
#else
3101
GSVector4 ret;
3102
ret.U64[0] = (F64[0] <= v.F64[0]) ? 0xFFFFFFFFFFFFFFFFULL : 0;
3103
ret.U64[1] = (F64[1] <= v.F64[1]) ? 0xFFFFFFFFFFFFFFFFULL : 0;
3104
return ret;
3105
#endif
3106
}
3107
3108
ALWAYS_INLINE GSVector4 min64(const GSVector4& v) const
3109
{
3110
#ifdef CPU_ARCH_ARM64
3111
return GSVector4(vreinterpretq_f32_f64(vminq_f64(vreinterpretq_f64_f32(v4s), vreinterpretq_f64_f32(v.v4s))));
3112
#else
3113
return GSVector4::f64(std::min(F64[0], v.F64[0]), std::min(F64[1], v.F64[1]));
3114
#endif
3115
}
3116
3117
ALWAYS_INLINE GSVector4 max64(const GSVector4& v) const
3118
{
3119
#ifdef CPU_ARCH_ARM64
3120
return GSVector4(vreinterpretq_f32_f64(vmaxq_f64(vreinterpretq_f64_f32(v4s), vreinterpretq_f64_f32(v.v4s))));
3121
#else
3122
return GSVector4::f64(std::max(F64[0], v.F64[0]), std::max(F64[1], v.F64[1]));
3123
#endif
3124
}
3125
3126
ALWAYS_INLINE GSVector4 abs64() const { return *this & GSVector4::cxpr64(static_cast<u64>(0x7FFFFFFFFFFFFFFFULL)); }
3127
3128
ALWAYS_INLINE GSVector4 neg64() const { return *this ^ GSVector4::cxpr64(static_cast<u64>(0x8000000000000000ULL)); }
3129
3130
ALWAYS_INLINE GSVector4 sqrt64() const
3131
{
3132
#ifdef CPU_ARCH_ARM64
3133
return GSVector4(vreinterpretq_f32_f64(vsqrtq_f64(vreinterpretq_f64_f32(v4s))));
3134
#else
3135
return GSVector4::f64(std::sqrt(F64[0]), std::sqrt(F64[1]));
3136
#endif
3137
}
3138
3139
ALWAYS_INLINE GSVector4 sqr64() const
3140
{
3141
#ifdef CPU_ARCH_ARM64
3142
return GSVector4(vreinterpretq_f32_f64(vmulq_f64(vreinterpretq_f64_f32(v4s), vreinterpretq_f64_f32(v4s))));
3143
#else
3144
return GSVector4::f64(F64[0] * F64[0], F64[1] * F64[1]);
3145
#endif
3146
}
3147
3148
ALWAYS_INLINE GSVector4 floor64() const
3149
{
3150
#ifdef CPU_ARCH_ARM64
3151
return GSVector4(vreinterpretq_f32_f64(vrndmq_f64(vreinterpretq_f64_f32(v4s))));
3152
#else
3153
return GSVector4::f64(std::floor(F64[0]), std::floor(F64[1]));
3154
#endif
3155
}
3156
3157
ALWAYS_INLINE static GSVector4 f32to64(const GSVector4& v)
{
#ifdef CPU_ARCH_ARM64
return GSVector4(vreinterpretq_f32_f64(vcvt_f64_f32(vget_low_f32(v.v4s))));
#else
return GSVector4::f64(static_cast<double>(vgetq_lane_f32(v.v4s, 0)), static_cast<double>(vgetq_lane_f32(v.v4s, 1)));
#endif
}

ALWAYS_INLINE static GSVector4 f32to64(const void* p)
{
#ifdef CPU_ARCH_ARM64
return GSVector4(vreinterpretq_f32_f64(vcvt_f64_f32(vld1_f32(static_cast<const float*>(p)))));
#else
const float* fp = static_cast<const float*>(p);
return GSVector4::f64(static_cast<double>(fp[0]), static_cast<double>(fp[1]));
#endif
}

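// f64toi32 truncates each f64 lane toward zero and packs the results into lanes 0/1, zeroing lanes 2/3,
// e.g. {1.9, -2.9} -> {1, -2, 0, 0}.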
ALWAYS_INLINE GSVector4i f64toi32() const
{
#ifdef CPU_ARCH_ARM64
const s32 low = static_cast<s32>(vgetq_lane_f64(vreinterpretq_f64_f32(v4s), 0));
const s32 high = static_cast<s32>(vgetq_lane_f64(vreinterpretq_f64_f32(v4s), 1));
#else
const s32 low = static_cast<s32>(F64[0]);
const s32 high = static_cast<s32>(F64[1]);
#endif
return GSVector4i(vsetq_lane_s32(high, vsetq_lane_s32(low, vdupq_n_s32(0), 0), 1));
}

ALWAYS_INLINE GSVector2 xy() const { return GSVector2(vget_low_f32(v4s)); }

ALWAYS_INLINE GSVector2 zw() const { return GSVector2(vget_high_f32(v4s)); }

ALWAYS_INLINE static GSVector4 xyxy(const GSVector2& l, const GSVector2& h)
{
return GSVector4(vcombine_f32(l.v2s, h.v2s));
}

ALWAYS_INLINE static GSVector4 xyxy(const GSVector2& l) { return GSVector4(vcombine_f32(l.v2s, l.v2s)); }

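// VECTOR4_SHUFFLE_* expands to every xyzw swizzle member, e.g. v.xxxx() or v.wzyx(). The one-argument
// form takes the first two selections from *this and the last two from the argument, so
// a.xyzw(b) == { a.x, a.y, b.z, b.w }.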
#define VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, ws, wn) \
ALWAYS_INLINE GSVector4 xs##ys##zs##ws() const \
{ \
return GSVector4(__builtin_shufflevector(v4s, v4s, xn, yn, zn, wn)); \
} \
ALWAYS_INLINE GSVector4 xs##ys##zs##ws(const GSVector4& v) const \
{ \
return GSVector4(__builtin_shufflevector(v4s, v.v4s, xn, yn, 4 + zn, 4 + wn)); \
}

#define VECTOR4_SHUFFLE_3(xs, xn, ys, yn, zs, zn) \
VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, x, 0); \
VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, y, 1); \
VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, z, 2); \
VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, w, 3);

#define VECTOR4_SHUFFLE_2(xs, xn, ys, yn) \
VECTOR4_SHUFFLE_3(xs, xn, ys, yn, x, 0); \
VECTOR4_SHUFFLE_3(xs, xn, ys, yn, y, 1); \
VECTOR4_SHUFFLE_3(xs, xn, ys, yn, z, 2); \
VECTOR4_SHUFFLE_3(xs, xn, ys, yn, w, 3);

#define VECTOR4_SHUFFLE_1(xs, xn) \
VECTOR4_SHUFFLE_2(xs, xn, x, 0); \
VECTOR4_SHUFFLE_2(xs, xn, y, 1); \
VECTOR4_SHUFFLE_2(xs, xn, z, 2); \
VECTOR4_SHUFFLE_2(xs, xn, w, 3);

VECTOR4_SHUFFLE_1(x, 0);
VECTOR4_SHUFFLE_1(y, 1);
VECTOR4_SHUFFLE_1(z, 2);
VECTOR4_SHUFFLE_1(w, 3);

#undef VECTOR4_SHUFFLE_1
#undef VECTOR4_SHUFFLE_2
#undef VECTOR4_SHUFFLE_3
#undef VECTOR4_SHUFFLE_4

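// broadcast32 splats lane 0 (or a single float loaded from memory) across all four lanes. vdupq_laneq_f32
// is AArch64-only, so the A32 path falls back to the xxxx() swizzle.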
ALWAYS_INLINE GSVector4 broadcast32() const
{
#ifdef CPU_ARCH_ARM64
return GSVector4(vdupq_laneq_f32(v4s, 0));
#else
return xxxx();
#endif
}

ALWAYS_INLINE static GSVector4 broadcast32(const GSVector4& v)
{
#ifdef CPU_ARCH_ARM64
return GSVector4(vdupq_laneq_f32(v.v4s, 0));
#else
return v.xxxx();
#endif
}

ALWAYS_INLINE static GSVector4 broadcast32(const void* f) { return GSVector4(vld1q_dup_f32((const float*)f)); }

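// broadcast64 duplicates a 64-bit value into both halves of the register; A32 has no float64x2_t, so the
// value is loaded as s64 and the bits reinterpreted.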
ALWAYS_INLINE static GSVector4 broadcast64(const void* f)
{
#ifdef CPU_ARCH_ARM64
return GSVector4(vreinterpretq_f32_f64(vld1q_dup_f64((const double*)f)));
#else
return GSVector4(vreinterpretq_f32_s64(vld1q_dup_s64((const s64*)f)));
#endif
}
};

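// Out-of-line conversions between the float and integer vector types: the converting constructors do a
// value conversion (vcvt, truncating toward zero for float->int), while cast() reinterprets the bits
// unchanged.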
ALWAYS_INLINE GSVector2i::GSVector2i(const GSVector2& v)
{
v2s = vcvt_s32_f32(v.v2s);
}

ALWAYS_INLINE GSVector2::GSVector2(const GSVector2i& v)
{
v2s = vcvt_f32_s32(v.v2s);
}

ALWAYS_INLINE GSVector2i GSVector2i::cast(const GSVector2& v)
{
return GSVector2i(vreinterpret_s32_f32(v.v2s));
}

ALWAYS_INLINE GSVector2 GSVector2::cast(const GSVector2i& v)
{
return GSVector2(vreinterpret_f32_s32(v.v2s));
}

ALWAYS_INLINE GSVector4i::GSVector4i(const GSVector4& v)
{
v4s = vcvtq_s32_f32(v.v4s);
}

ALWAYS_INLINE GSVector4::GSVector4(const GSVector4i& v)
{
v4s = vcvtq_f32_s32(v.v4s);
}

ALWAYS_INLINE GSVector4i GSVector4i::cast(const GSVector4& v)
{
return GSVector4i(vreinterpretq_s32_f32(v.v4s));
}

ALWAYS_INLINE GSVector4 GSVector4::cast(const GSVector4i& v)
{
return GSVector4(vreinterpretq_f32_s32(v.v4s));
}