GitHub Repository: stenzek/duckstation
Path: blob/master/src/common/gsvector_neon.h
// SPDX-FileCopyrightText: 2019-2024 Connor McLaughlin <[email protected]>
// SPDX-License-Identifier: CC-BY-NC-ND-4.0

#include "common/intrin.h"
#include "common/types.h"

#include <algorithm>
#include <cmath>
#include <cstdint>

#define GSVECTOR_HAS_FAST_INT_SHUFFLE8 1
#define GSVECTOR_HAS_SRLV 1

#ifdef CPU_ARCH_ARM64
// tbl2 with 128-bit vectors is not in A32.
#define GSVECTOR_HAS_TBL2 1
#endif

class GSVector2;
class GSVector2i;
class GSVector4;
class GSVector4i;
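
// The GSVECTOR_HAS_* macros above advertise which fast paths this backend
// provides so shared GSVector code can select them at compile time, e.g.
// (illustrative sketch only, not a call site in this file):
//
//   #ifdef GSVECTOR_HAS_TBL2
//     // single two-table byte shuffle (vqtbl2q_u8) across 32 source bytes
//   #else
//     // two single-table shuffles plus a combine
//   #endif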
class alignas(16) GSVector2i
25
{
26
struct cxpr_init_tag
27
{
28
};
29
static constexpr cxpr_init_tag cxpr_init{};
30
31
constexpr GSVector2i(cxpr_init_tag, s32 x, s32 y) : S32{x, y} {}
32
33
constexpr GSVector2i(cxpr_init_tag, s16 s0, s16 s1, s16 s2, s16 s3) : S16{s0, s1, s2, s3} {}
34
35
constexpr GSVector2i(cxpr_init_tag, s8 b0, s8 b1, s8 b2, s8 b3, s8 b4, s8 b5, s8 b6, s8 b7)
36
: S8{b0, b1, b2, b3, b4, b5, b6, b7}
37
{
38
}
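
// Note: the cxpr_init tag above dispatches to constructors that fill the
// union through its plain arrays, so the cxpr()/cxpr16()/cxpr8() helpers
// below can build compile-time constants; the runtime constructors go
// through NEON intrinsics instead.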
39
40
public:
41
union
42
{
43
struct
44
{
45
s32 x, y;
46
};
47
struct
48
{
49
s32 r, g;
50
};
51
float F32[2];
52
s8 S8[8];
53
s16 S16[4];
54
s32 S32[2];
55
s64 S64[1];
56
u8 U8[8];
57
u16 U16[4];
58
u32 U32[2];
59
u64 U64[1];
60
int32x2_t v2s;
61
};
62
63
GSVector2i() = default;
64
65
ALWAYS_INLINE constexpr static GSVector2i cxpr(s32 x, s32 y) { return GSVector2i(cxpr_init, x, y); }
66
67
ALWAYS_INLINE constexpr static GSVector2i cxpr(s32 x) { return GSVector2i(cxpr_init, x, x); }
68
69
ALWAYS_INLINE constexpr static GSVector2i cxpr16(s16 x) { return GSVector2i(cxpr_init, x, x, x, x); }
70
71
ALWAYS_INLINE constexpr static GSVector2i cxpr16(s16 s0, s16 s1, s16 s2, s16 s3)
72
{
73
return GSVector2i(cxpr_init, s0, s1, s2, s3);
74
}
75
76
ALWAYS_INLINE constexpr static GSVector2i cxpr8(s8 b0, s8 b1, s8 b2, s8 b3, s8 b4, s8 b5, s8 b6, s8 b7)
77
{
78
return GSVector2i(cxpr_init, b0, b1, b2, b3, b4, b5, b6, b7);
79
}
80
81
ALWAYS_INLINE GSVector2i(s32 x, s32 y) { v2s = vset_lane_s32(y, vdup_n_s32(x), 1); }
82
83
ALWAYS_INLINE GSVector2i(s16 s0, s16 s1, s16 s2, s16 s3) : S16{s0, s1, s2, s3} {}
84
85
ALWAYS_INLINE constexpr GSVector2i(s8 b0, s8 b1, s8 b2, s8 b3, s8 b4, s8 b5, s8 b6, s8 b7)
86
: S8{b0, b1, b2, b3, b4, b5, b6, b7}
87
{
88
}
89
90
ALWAYS_INLINE explicit GSVector2i(int i) { *this = i; }
91
92
ALWAYS_INLINE constexpr explicit GSVector2i(int32x2_t m) : v2s(m) {}
93
94
ALWAYS_INLINE explicit GSVector2i(const GSVector2& v);
95
96
ALWAYS_INLINE static GSVector2i cast(const GSVector2& v);
97
98
ALWAYS_INLINE void operator=(int i) { v2s = vdup_n_s32(i); }
99
100
ALWAYS_INLINE operator int32x2_t() const { return v2s; }
101
102
ALWAYS_INLINE GSVector2i sat_s8(const GSVector2i& min, const GSVector2i& max) const
103
{
104
return max_s8(min).min_s8(max);
105
}
106
ALWAYS_INLINE GSVector2i sat_s16(const GSVector2i& min, const GSVector2i& max) const
107
{
108
return max_s16(min).min_s16(max);
109
}
110
ALWAYS_INLINE GSVector2i sat_s32(const GSVector2i& min, const GSVector2i& max) const
111
{
112
return max_s32(min).min_s32(max);
113
}
114
115
ALWAYS_INLINE GSVector2i sat_u8(const GSVector2i& min, const GSVector2i& max) const
116
{
117
return max_u8(min).min_u8(max);
118
}
119
ALWAYS_INLINE GSVector2i sat_u16(const GSVector2i& min, const GSVector2i& max) const
120
{
121
return max_u16(min).min_u16(max);
122
}
123
ALWAYS_INLINE GSVector2i sat_u32(const GSVector2i& min, const GSVector2i& max) const
124
{
125
return max_u32(min).min_u32(max);
126
}
127
128
ALWAYS_INLINE GSVector2i min_s8(const GSVector2i& v) const
129
{
130
return GSVector2i(vreinterpret_s32_s8(vmin_s8(vreinterpret_s8_s32(v2s), vreinterpret_s8_s32(v.v2s))));
131
}
132
133
ALWAYS_INLINE GSVector2i max_s8(const GSVector2i& v) const
134
{
135
return GSVector2i(vreinterpret_s32_s8(vmax_s8(vreinterpret_s8_s32(v2s), vreinterpret_s8_s32(v.v2s))));
136
}
137
138
ALWAYS_INLINE GSVector2i min_s16(const GSVector2i& v) const
139
{
140
return GSVector2i(vreinterpret_s32_s16(vmin_s16(vreinterpret_s16_s32(v2s), vreinterpret_s16_s32(v.v2s))));
141
}
142
143
ALWAYS_INLINE GSVector2i max_s16(const GSVector2i& v) const
144
{
145
return GSVector2i(vreinterpret_s32_s16(vmax_s16(vreinterpret_s16_s32(v2s), vreinterpret_s16_s32(v.v2s))));
146
}
147
148
ALWAYS_INLINE GSVector2i min_s32(const GSVector2i& v) const { return GSVector2i(vmin_s32(v2s, v.v2s)); }
149
150
ALWAYS_INLINE GSVector2i max_s32(const GSVector2i& v) const { return GSVector2i(vmax_s32(v2s, v.v2s)); }
151
152
ALWAYS_INLINE GSVector2i min_u8(const GSVector2i& v) const
153
{
154
return GSVector2i(vreinterpret_s32_u8(vmin_u8(vreinterpret_u8_s32(v2s), vreinterpret_u8_s32(v.v2s))));
155
}
156
157
ALWAYS_INLINE GSVector2i max_u8(const GSVector2i& v) const
158
{
159
return GSVector2i(vreinterpret_s32_u8(vmax_u8(vreinterpret_u8_s32(v2s), vreinterpret_u8_s32(v.v2s))));
160
}
161
162
ALWAYS_INLINE GSVector2i min_u16(const GSVector2i& v) const
163
{
164
return GSVector2i(vreinterpret_s32_u16(vmin_u16(vreinterpret_u16_s32(v2s), vreinterpret_u16_s32(v.v2s))));
165
}
166
167
ALWAYS_INLINE GSVector2i max_u16(const GSVector2i& v) const
168
{
169
return GSVector2i(vreinterpret_s32_u16(vmax_u16(vreinterpret_u16_s32(v2s), vreinterpret_u16_s32(v.v2s))));
170
}
171
172
ALWAYS_INLINE GSVector2i min_u32(const GSVector2i& v) const
173
{
174
return GSVector2i(vreinterpret_s32_u32(vmin_u32(vreinterpret_u32_s32(v2s), vreinterpret_u32_s32(v.v2s))));
175
}
176
177
ALWAYS_INLINE GSVector2i max_u32(const GSVector2i& v) const
178
{
179
return GSVector2i(vreinterpret_s32_u32(vmax_u32(vreinterpret_u32_s32(v2s), vreinterpret_u32_s32(v.v2s))));
180
}
181
182
ALWAYS_INLINE s32 addv_s32() const
183
{
184
#ifdef CPU_ARCH_ARM64
185
return vaddv_s32(v2s);
186
#else
187
return vget_lane_s32(v2s, 0) + vget_lane_s32(v2s, 1);
188
#endif
189
}
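
// Usage note: addv_s32() is a horizontal add across both lanes, so e.g.
// GSVector2i(3, 4).addv_s32() == 7 on either path above.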
190
191
#ifdef CPU_ARCH_ARM64
192
193
ALWAYS_INLINE u8 minv_u8() const { return vminv_u8(vreinterpret_u8_s32(v2s)); }
194
195
ALWAYS_INLINE u16 maxv_u8() const { return vmaxv_u8(vreinterpret_u8_s32(v2s)); }
196
197
ALWAYS_INLINE u16 minv_u16() const { return vminv_u16(vreinterpret_u16_s32(v2s)); }
198
199
ALWAYS_INLINE u16 maxv_u16() const { return vmaxv_u16(vreinterpret_u16_s32(v2s)); }
200
201
ALWAYS_INLINE s32 minv_s32() const { return vminv_s32(v2s); }
202
203
ALWAYS_INLINE u32 minv_u32() const { return vminv_u32(vreinterpret_u32_s32(v2s)); }
204
205
ALWAYS_INLINE s32 maxv_s32() const { return vmaxv_s32(v2s); }
206
207
ALWAYS_INLINE u32 maxv_u32() const { return vmaxv_u32(vreinterpret_u32_s32(v2s)); }
208
209
#else
210
211
ALWAYS_INLINE u8 minv_u8() const
212
{
213
uint8x8_t vmin = vmin_u8(vreinterpret_u8_s32(v2s), vreinterpret_u8_s32(vdup_lane_s32(v2s, 1)));
214
return static_cast<u8>(
215
std::min(static_cast<u32>(vget_lane_u8(vmin, 0)),
216
std::min(static_cast<u32>(vget_lane_u8(vmin, 1)),
217
std::min(static_cast<u32>(vget_lane_u8(vmin, 2)), static_cast<u32>(vget_lane_u8(vmin, 3))))));
218
}
219
220
ALWAYS_INLINE u16 maxv_u8() const
221
{
222
uint8x8_t vmax = vmax_u8(vreinterpret_u8_s32(v2s), vreinterpret_u8_s32(vdup_lane_s32(v2s, 1)));
223
return static_cast<u8>(
224
std::max(static_cast<u32>(vget_lane_u8(vmax, 0)),
225
std::max(static_cast<u32>(vget_lane_u8(vmax, 1)),
226
std::max(static_cast<u32>(vget_lane_u8(vmax, 2)), static_cast<u32>(vget_lane_u8(vmax, 3))))));
227
}
228
229
ALWAYS_INLINE u16 minv_u16() const
230
{
231
uint16x4_t vmin = vmin_u16(vreinterpret_u16_s32(v2s), vreinterpret_u16_s32(vdup_lane_s32(v2s, 1)));
232
return static_cast<u16>(
233
std::min(static_cast<u32>(vget_lane_u16(vmin, 0)), static_cast<u32>(vget_lane_u16(vmin, 1))));
234
}
235
236
ALWAYS_INLINE u16 maxv_u16() const
237
{
238
uint16x4_t vmax = vmax_u16(vreinterpret_u16_s32(v2s), vreinterpret_u16_s32(vdup_lane_s32(v2s, 1)));
239
return static_cast<u16>(
240
std::max<u32>(static_cast<u32>(vget_lane_u16(vmax, 0)), static_cast<u32>(vget_lane_u16(vmax, 1))));
241
}
242
243
ALWAYS_INLINE s32 minv_s32() const { return std::min<s32>(vget_lane_s32(v2s, 0), vget_lane_s32(v2s, 1)); }
244
245
ALWAYS_INLINE u32 minv_u32() const
246
{
247
return std::min<u32>(vget_lane_u32(vreinterpret_u32_s32(v2s), 0), vget_lane_u32(vreinterpret_u32_s32(v2s), 1));
248
}
249
250
ALWAYS_INLINE s32 maxv_s32() const { return std::max<s32>(vget_lane_s32(v2s, 0), vget_lane_s32(v2s, 1)); }
251
252
ALWAYS_INLINE u32 maxv_u32() const
253
{
254
return std::max<u32>(vget_lane_u32(vreinterpret_u32_s32(v2s), 0), vget_lane_u32(vreinterpret_u32_s32(v2s), 1));
255
}
256
257
#endif
258
259
ALWAYS_INLINE GSVector2i clamp8() const { return pu16().upl8(); }
260
261
ALWAYS_INLINE GSVector2i blend8(const GSVector2i& a, const GSVector2i& mask) const
262
{
263
uint8x8_t mask2 = vreinterpret_u8_s8(vshr_n_s8(vreinterpret_s8_s32(mask.v2s), 7));
264
return GSVector2i(vreinterpret_s32_u8(vbsl_u8(mask2, vreinterpret_u8_s32(a.v2s), vreinterpret_u8_s32(v2s))));
265
}
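
// blend8() selects per byte on the sign bit of 'mask': vshr_n_s8(mask, 7)
// smears each byte's MSB across the byte, and vbsl_u8 then takes bytes from
// 'a' where that mask byte was negative and from *this otherwise. E.g. a
// mask byte of 0x80 picks 'a', a mask byte of 0x00 keeps *this.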
266
267
template<int mask>
268
ALWAYS_INLINE GSVector2i blend16(const GSVector2i& a) const
269
{
270
static constexpr const uint16_t _mask[4] = {
271
((mask) & (1 << 0)) ? (uint16_t)-1 : 0x0, ((mask) & (1 << 1)) ? (uint16_t)-1 : 0x0,
272
((mask) & (1 << 2)) ? (uint16_t)-1 : 0x0, ((mask) & (1 << 3)) ? (uint16_t)-1 : 0x0};
273
return GSVector2i(
274
vreinterpret_s32_u16(vbsl_u16(vld1_u16(_mask), vreinterpret_u16_s32(a.v2s), vreinterpret_u16_s32(v2s))));
275
}
276
277
template<int mask>
278
ALWAYS_INLINE GSVector2i blend32(const GSVector2i& v) const
279
{
280
constexpr int bit1 = ((mask & 2) * 3) << 1;
281
constexpr int bit0 = (mask & 1) * 3;
282
return blend16<bit1 | bit0>(v);
283
}
284
285
ALWAYS_INLINE GSVector2i blend(const GSVector2i& v, const GSVector2i& mask) const
286
{
287
return GSVector2i(vreinterpret_s32_s8(vorr_s8(vbic_s8(vreinterpret_s8_s32(v2s), vreinterpret_s8_s32(mask.v2s)),
288
vand_s8(vreinterpret_s8_s32(mask.v2s), vreinterpret_s8_s32(v.v2s)))));
289
}
290
291
ALWAYS_INLINE GSVector2i shuffle8(const GSVector2i& mask) const
292
{
293
return GSVector2i(vreinterpret_s32_s8(vtbl1_s8(vreinterpret_s8_s32(v2s), vreinterpret_u8_s32(mask.v2s))));
294
}
295
296
ALWAYS_INLINE GSVector2i ps16() const
297
{
298
return GSVector2i(vreinterpret_s32_s8(vqmovn_s16(vcombine_s16(vreinterpret_s16_s32(v2s), vcreate_s16(0)))));
299
}
300
301
ALWAYS_INLINE GSVector2i pu16() const
302
{
303
return GSVector2i(vreinterpret_s32_u8(vqmovn_u16(vcombine_u16(vreinterpret_u16_s32(v2s), vcreate_u16(0)))));
304
}
305
306
ALWAYS_INLINE GSVector2i ps32() const
307
{
308
return GSVector2i(vreinterpret_s32_s16(vqmovn_s32(vcombine_s32(v2s, vcreate_s32(0)))));
309
}
310
311
ALWAYS_INLINE GSVector2i pu32() const
312
{
313
return GSVector2i(vreinterpret_s32_u16(vqmovn_u32(vcombine_u32(vreinterpret_u32_s32(v2s), vcreate_u32(0)))));
314
}
315
316
#ifdef CPU_ARCH_ARM64
317
318
ALWAYS_INLINE GSVector2i upl8(const GSVector2i& v) const
319
{
320
return GSVector2i(vreinterpret_s32_s8(vzip1_s8(vreinterpret_s8_s32(v2s), vreinterpret_s8_s32(v.v2s))));
321
}
322
323
ALWAYS_INLINE GSVector2i upl16(const GSVector2i& v) const
324
{
325
return GSVector2i(vreinterpret_s32_s16(vzip1_s16(vreinterpret_s16_s32(v2s), vreinterpret_s16_s32(v.v2s))));
326
}
327
ALWAYS_INLINE GSVector2i upl32(const GSVector2i& v) const { return GSVector2i(vzip1_s32(v2s, v.v2s)); }
328
329
ALWAYS_INLINE GSVector2i upl8() const
330
{
331
return GSVector2i(vreinterpret_s32_s8(vzip1_s8(vreinterpret_s8_s32(v2s), vdup_n_s8(0))));
332
}
333
334
ALWAYS_INLINE GSVector2i upl16() const
335
{
336
return GSVector2i(vreinterpret_s32_s16(vzip1_s16(vreinterpret_s16_s32(v2s), vdup_n_s16(0))));
337
}
338
339
ALWAYS_INLINE GSVector2i upl32() const { return GSVector2i(vzip1_s32(v2s, vdup_n_s32(0))); }
340
341
#else
342
343
ALWAYS_INLINE GSVector2i upl8(const GSVector2i& v) const
344
{
345
return GSVector2i(vreinterpret_s32_s8(vzip_s8(vreinterpret_s8_s32(v2s), vreinterpret_s8_s32(v.v2s)).val[0]));
346
}
347
348
ALWAYS_INLINE GSVector2i upl16(const GSVector2i& v) const
349
{
350
return GSVector2i(vreinterpret_s32_s16(vzip_s16(vreinterpret_s16_s32(v2s), vreinterpret_s16_s32(v.v2s)).val[0]));
351
}
352
ALWAYS_INLINE GSVector2i upl32(const GSVector2i& v) const { return GSVector2i(vzip_s32(v2s, v.v2s).val[0]); }
353
354
ALWAYS_INLINE GSVector2i upl8() const
355
{
356
return GSVector2i(vreinterpret_s32_s8(vzip_s8(vreinterpret_s8_s32(v2s), vdup_n_s8(0)).val[0]));
357
}
358
359
ALWAYS_INLINE GSVector2i upl16() const
360
{
361
return GSVector2i(vreinterpret_s32_s16(vzip_s16(vreinterpret_s16_s32(v2s), vdup_n_s16(0)).val[0]));
362
}
363
364
ALWAYS_INLINE GSVector2i upl32() const { return GSVector2i(vzip_s32(v2s, vdup_n_s32(0)).val[0]); }
365
366
#endif
367
368
ALWAYS_INLINE GSVector2i s8to16() const
369
{
370
return GSVector2i(vreinterpret_s32_s16(vget_low_s16(vmovl_s8(vreinterpret_s8_s32(v2s)))));
371
}
372
373
ALWAYS_INLINE GSVector2i u8to16() const
374
{
375
return GSVector2i(vreinterpret_s32_u16(vget_low_u16(vmovl_u8(vreinterpret_u8_s32(v2s)))));
376
}
377
378
ALWAYS_INLINE GSVector2i s8to32() const
379
{
380
return GSVector2i(vget_low_s32(vmovl_s16(vget_low_s16(vmovl_s8(vreinterpret_s8_s32(v2s))))));
381
}
382
383
ALWAYS_INLINE GSVector2i u8to32() const
384
{
385
return GSVector2i(vreinterpret_s32_u32(vget_low_u32(vmovl_u16(vget_low_u16(vmovl_u8(vreinterpret_u8_s32(v2s)))))));
386
}
387
388
ALWAYS_INLINE GSVector2i s16to32() const { return GSVector2i(vget_low_s32(vmovl_s16(vreinterpret_s16_s32(v2s)))); }
389
390
ALWAYS_INLINE GSVector2i u16to32() const
391
{
392
return GSVector2i(vreinterpret_s32_u32(vget_low_u32(vmovl_u16(vreinterpret_u16_s32(v2s)))));
393
}
394
395
template<int i>
396
ALWAYS_INLINE GSVector2i srl() const
397
{
398
return GSVector2i(vreinterpret_s32_s8(vext_s8(vreinterpret_s8_s32(v2s), vdup_n_s8(0), i)));
399
}
400
401
template<int i>
402
ALWAYS_INLINE GSVector2i sll() const
403
{
404
return GSVector2i(vreinterpret_s32_s8(vext_s8(vdup_n_s8(0), vreinterpret_s8_s32(v2s), 8 - i)));
405
}
406
407
template<int i>
408
ALWAYS_INLINE GSVector2i sll16() const
409
{
410
return GSVector2i(vreinterpret_s32_s16(vshl_n_s16(vreinterpret_s16_s32(v2s), i)));
411
}
412
413
ALWAYS_INLINE GSVector2i sll16(s32 i) const
414
{
415
return GSVector2i(vreinterpret_s32_s16(vshl_s16(vreinterpret_s16_s32(v2s), vdup_n_s16(i))));
416
}
417
418
ALWAYS_INLINE GSVector2i sllv16(const GSVector2i& v) const
419
{
420
return GSVector2i(vreinterpret_s32_s16(vshl_s16(vreinterpret_s16_s32(v2s), vreinterpret_s16_s32(v.v2s))));
421
}
422
423
template<int i>
424
ALWAYS_INLINE GSVector2i srl16() const
425
{
426
return GSVector2i(vreinterpret_s32_u16(vshr_n_u16(vreinterpret_u16_s32(v2s), i)));
427
}
428
429
ALWAYS_INLINE GSVector2i srl16(s32 i) const
430
{
431
return GSVector2i(vreinterpret_s32_u16(vshl_u16(vreinterpret_u16_s32(v2s), vdup_n_s16(-i))));
432
}
433
434
ALWAYS_INLINE GSVector2i srlv16(const GSVector2i& v) const
435
{
436
return GSVector2i(vreinterpret_s32_u16(vshl_u16(vreinterpret_u16_s32(v2s), vneg_s16(vreinterpret_s16_s32(v.v2s)))));
437
}
438
439
template<int i>
440
ALWAYS_INLINE GSVector2i sra16() const
441
{
442
constexpr int count = (i & ~15) ? 15 : i;
443
return GSVector2i(vreinterpret_s32_s16(vshr_n_s16(vreinterpret_s16_s32(v2s), count)));
444
}
445
446
ALWAYS_INLINE GSVector2i sra16(s32 i) const
447
{
448
return GSVector2i(vreinterpret_s32_s16(vshl_s16(vreinterpret_s16_s32(v2s), vdup_n_s16(-i))));
449
}
450
451
ALWAYS_INLINE GSVector2i srav16(const GSVector2i& v) const
452
{
453
return GSVector2i(vreinterpret_s32_s16(vshl_s16(vreinterpret_s16_s32(v2s), vneg_s16(vreinterpret_s16_s32(v.v2s)))));
454
}
455
456
template<int i>
457
ALWAYS_INLINE GSVector2i sll32() const
458
{
459
return GSVector2i(vshl_n_s32(v2s, i));
460
}
461
462
ALWAYS_INLINE GSVector2i sll32(s32 i) const { return GSVector2i(vshl_s32(v2s, vdup_n_s32(i))); }
463
464
ALWAYS_INLINE GSVector2i sllv32(const GSVector2i& v) const { return GSVector2i(vshl_s32(v2s, v.v2s)); }
465
466
template<int i>
467
ALWAYS_INLINE GSVector2i srl32() const
468
{
469
return GSVector2i(vreinterpret_s32_u32(vshr_n_u32(vreinterpret_u32_s32(v2s), i)));
470
}
471
472
ALWAYS_INLINE GSVector2i srl32(s32 i) const
473
{
474
return GSVector2i(vreinterpret_s32_u32(vshl_u32(vreinterpret_u32_s32(v2s), vdup_n_s32(-i))));
475
}
476
477
ALWAYS_INLINE GSVector2i srlv32(const GSVector2i& v) const
478
{
479
return GSVector2i(vreinterpret_s32_u32(vshl_u32(vreinterpret_u32_s32(v2s), vneg_s32(v.v2s))));
480
}
481
482
template<int i>
483
ALWAYS_INLINE GSVector2i sra32() const
484
{
485
return GSVector2i(vshr_n_s32(v2s, i));
486
}
487
488
ALWAYS_INLINE GSVector2i sra32(s32 i) const { return GSVector2i(vshl_s32(v2s, vdup_n_s32(-i))); }
489
490
ALWAYS_INLINE GSVector2i srav32(const GSVector2i& v) const
491
{
492
return GSVector2i(vshl_s32(v2s, vneg_s32(v.v2s)));
493
}
494
495
ALWAYS_INLINE GSVector2i add8(const GSVector2i& v) const
496
{
497
return GSVector2i(vreinterpret_s32_s8(vadd_s8(vreinterpret_s8_s32(v2s), vreinterpret_s8_s32(v.v2s))));
498
}
499
500
ALWAYS_INLINE GSVector2i add16(const GSVector2i& v) const
501
{
502
return GSVector2i(vreinterpret_s32_s16(vadd_s16(vreinterpret_s16_s32(v2s), vreinterpret_s16_s32(v.v2s))));
503
}
504
505
ALWAYS_INLINE GSVector2i add32(const GSVector2i& v) const { return GSVector2i(vadd_s32(v2s, v.v2s)); }
506
507
ALWAYS_INLINE GSVector2i adds8(const GSVector2i& v) const
508
{
509
return GSVector2i(vreinterpret_s32_s8(vqadd_s8(vreinterpret_s8_s32(v2s), vreinterpret_s8_s32(v.v2s))));
510
}
511
512
ALWAYS_INLINE GSVector2i adds16(const GSVector2i& v) const
513
{
514
return GSVector2i(vreinterpret_s32_s16(vqadd_s16(vreinterpret_s16_s32(v2s), vreinterpret_s16_s32(v.v2s))));
515
}
516
517
ALWAYS_INLINE GSVector2i addus8(const GSVector2i& v) const
518
{
519
return GSVector2i(vreinterpret_s32_u8(vqadd_u8(vreinterpret_u8_s32(v2s), vreinterpret_u8_s32(v.v2s))));
520
}
521
522
ALWAYS_INLINE GSVector2i addus16(const GSVector2i& v) const
523
{
524
return GSVector2i(vreinterpret_s32_u16(vqadd_u16(vreinterpret_u16_s32(v2s), vreinterpret_u16_s32(v.v2s))));
525
}
526
527
ALWAYS_INLINE GSVector2i sub8(const GSVector2i& v) const
528
{
529
return GSVector2i(vreinterpret_s32_s8(vsub_s8(vreinterpret_s8_s32(v2s), vreinterpret_s8_s32(v.v2s))));
530
}
531
532
ALWAYS_INLINE GSVector2i sub16(const GSVector2i& v) const
533
{
534
return GSVector2i(vreinterpret_s32_s16(vsub_s16(vreinterpret_s16_s32(v2s), vreinterpret_s16_s32(v.v2s))));
535
}
536
537
ALWAYS_INLINE GSVector2i sub32(const GSVector2i& v) const { return GSVector2i(vsub_s32(v2s, v.v2s)); }
538
539
ALWAYS_INLINE GSVector2i subs8(const GSVector2i& v) const
540
{
541
return GSVector2i(vreinterpret_s32_s8(vqsub_s8(vreinterpret_s8_s32(v2s), vreinterpret_s8_s32(v.v2s))));
542
}
543
544
ALWAYS_INLINE GSVector2i subs16(const GSVector2i& v) const
545
{
546
return GSVector2i(vreinterpret_s32_s16(vqsub_s16(vreinterpret_s16_s32(v2s), vreinterpret_s16_s32(v.v2s))));
547
}
548
549
ALWAYS_INLINE GSVector2i subus8(const GSVector2i& v) const
550
{
551
return GSVector2i(vreinterpret_s32_u8(vqsub_u8(vreinterpret_u8_s32(v2s), vreinterpret_u8_s32(v.v2s))));
552
}
553
554
ALWAYS_INLINE GSVector2i subus16(const GSVector2i& v) const
555
{
556
return GSVector2i(vreinterpret_s32_u16(vqsub_u16(vreinterpret_u16_s32(v2s), vreinterpret_u16_s32(v.v2s))));
557
}
558
559
ALWAYS_INLINE GSVector2i mul16l(const GSVector2i& v) const
560
{
561
return GSVector2i(vreinterpret_s32_s16(vmul_s16(vreinterpret_s16_s32(v2s), vreinterpret_s16_s32(v.v2s))));
562
}
563
564
ALWAYS_INLINE GSVector2i mul32l(const GSVector2i& v) const { return GSVector2i(vmul_s32(v2s, v.v2s)); }
565
566
ALWAYS_INLINE bool eq(const GSVector2i& v) const
567
{
568
return (vget_lane_u64(vreinterpret_u64_s32(veor_s32(v2s, v.v2s)), 0) == 0);
569
}
570
571
ALWAYS_INLINE GSVector2i eq8(const GSVector2i& v) const
572
{
573
return GSVector2i(vreinterpret_s32_u8(vceq_s8(vreinterpret_s8_s32(v2s), vreinterpret_s8_s32(v.v2s))));
574
}
575
576
ALWAYS_INLINE GSVector2i eq16(const GSVector2i& v) const
577
{
578
return GSVector2i(vreinterpret_s32_u16(vceq_s16(vreinterpret_s16_s32(v2s), vreinterpret_s16_s32(v.v2s))));
579
}
580
581
ALWAYS_INLINE GSVector2i eq32(const GSVector2i& v) const
582
{
583
return GSVector2i(vreinterpret_s32_u32(vceq_s32(v2s, v.v2s)));
584
}
585
586
ALWAYS_INLINE GSVector2i neq8(const GSVector2i& v) const { return ~eq8(v); }
587
588
ALWAYS_INLINE GSVector2i neq16(const GSVector2i& v) const { return ~eq16(v); }
589
590
ALWAYS_INLINE GSVector2i neq32(const GSVector2i& v) const { return ~eq32(v); }
591
592
ALWAYS_INLINE GSVector2i gt8(const GSVector2i& v) const
593
{
594
return GSVector2i(vreinterpret_s32_u8(vcgt_s8(vreinterpret_s8_s32(v2s), vreinterpret_s8_s32(v.v2s))));
595
}
596
597
ALWAYS_INLINE GSVector2i gt16(const GSVector2i& v) const
598
{
599
return GSVector2i(vreinterpret_s32_u16(vcgt_s16(vreinterpret_s16_s32(v2s), vreinterpret_s16_s32(v.v2s))));
600
}
601
602
ALWAYS_INLINE GSVector2i gt32(const GSVector2i& v) const { return GSVector2i(vreinterpret_s32_u32(vcgt_s32(v2s, v.v2s))); }
603
604
ALWAYS_INLINE GSVector2i ge8(const GSVector2i& v) const
605
{
606
return GSVector2i(vreinterpret_s32_u8(vcge_s8(vreinterpret_s8_s32(v2s), vreinterpret_s8_s32(v.v2s))));
607
}
608
ALWAYS_INLINE GSVector2i ge16(const GSVector2i& v) const
609
{
610
return GSVector2i(vreinterpret_s32_u16(vcge_s16(vreinterpret_s16_s32(v2s), vreinterpret_s16_s32(v.v2s))));
611
}
612
ALWAYS_INLINE GSVector2i ge32(const GSVector2i& v) const { return GSVector2i(vreinterpret_s32_u32(vcge_s32(v2s, v.v2s))); }
613
614
ALWAYS_INLINE GSVector2i lt8(const GSVector2i& v) const
615
{
616
return GSVector2i(vreinterpret_s32_u8(vclt_s8(vreinterpret_s8_s32(v2s), vreinterpret_s8_s32(v.v2s))));
617
}
618
619
ALWAYS_INLINE GSVector2i lt16(const GSVector2i& v) const
620
{
621
return GSVector2i(vreinterpret_s32_u16(vclt_s16(vreinterpret_s16_s32(v2s), vreinterpret_s16_s32(v.v2s))));
622
}
623
624
ALWAYS_INLINE GSVector2i lt32(const GSVector2i& v) const { return GSVector2i(vreinterpret_s32_u32(vclt_s32(v2s, v.v2s))); }
625
626
ALWAYS_INLINE GSVector2i le8(const GSVector2i& v) const
627
{
628
return GSVector2i(vreinterpret_s32_u8(vcle_s8(vreinterpret_s8_s32(v2s), vreinterpret_s8_s32(v.v2s))));
629
}
630
ALWAYS_INLINE GSVector2i le16(const GSVector2i& v) const
631
{
632
return GSVector2i(vreinterpret_s32_u16(vcle_s16(vreinterpret_s16_s32(v2s), vreinterpret_s16_s32(v.v2s))));
633
}
634
ALWAYS_INLINE GSVector2i le32(const GSVector2i& v) const { return GSVector2i(vreinterpret_s32_u32(vcle_s32(v2s, v.v2s))); }
635
636
ALWAYS_INLINE GSVector2i andnot(const GSVector2i& v) const { return GSVector2i(vbic_s32(v2s, v.v2s)); }
637
638
ALWAYS_INLINE int mask() const
639
{
640
// borrowed from sse2neon
641
const uint16x4_t high_bits = vreinterpret_u16_u8(vshr_n_u8(vreinterpret_u8_s32(v2s), 7));
642
const uint32x2_t paired16 = vreinterpret_u32_u16(vsra_n_u16(high_bits, high_bits, 7));
643
const uint64x1_t paired32 = vreinterpret_u64_u32(vsra_n_u32(paired16, paired16, 14));
644
const uint8x8_t paired64 = vreinterpret_u8_u64(vsra_n_u64(paired32, paired32, 28));
645
return static_cast<int>(vget_lane_u8(paired64, 0));
646
}
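
// The shift/accumulate chain above packs the per-byte sign bits into the low
// byte of the result (the same trick sse2neon uses for _mm_movemask_epi8,
// here on a 64-bit vector). For example, if only bytes 0 and 7 have their
// sign bit set, mask() returns 0b10000001 == 0x81.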
647
648
ALWAYS_INLINE bool alltrue() const
649
{
650
return (vget_lane_u64(vreinterpret_u64_s32(v2s), 0) == UINT64_C(0xFFFFFFFFFFFFFFFF));
651
}
652
653
ALWAYS_INLINE bool allfalse() const { return (vget_lane_u64(vreinterpret_u64_s32(v2s), 0) == UINT64_C(0)); }
654
655
template<int i>
656
ALWAYS_INLINE GSVector2i insert8(int a) const
657
{
658
return GSVector2i(vreinterpret_s32_u8(vset_lane_u8(a, vreinterpret_u8_s32(v2s), static_cast<uint8_t>(i))));
659
}
660
661
template<int i>
662
ALWAYS_INLINE int extract8() const
663
{
664
return vget_lane_u8(vreinterpret_u8_s32(v2s), i);
665
}
666
667
template<int i>
668
ALWAYS_INLINE GSVector2i insert16(int a) const
669
{
670
return GSVector2i(vreinterpret_s32_u16(vset_lane_u16(a, vreinterpret_u16_s32(v2s), static_cast<uint16_t>(i))));
671
}
672
673
template<int i>
674
ALWAYS_INLINE int extract16() const
675
{
676
return vget_lane_u16(vreinterpret_u16_s32(v2s), i);
677
}
678
679
template<int i>
680
ALWAYS_INLINE GSVector2i insert32(int a) const
681
{
682
return GSVector2i(vset_lane_s32(a, v2s, i));
683
}
684
685
template<int i>
686
ALWAYS_INLINE int extract32() const
687
{
688
return vget_lane_s32(v2s, i);
689
}
690
691
ALWAYS_INLINE static GSVector2i load32(const void* p)
692
{
693
// should be ldr s0, [x0]
694
u32 val;
695
std::memcpy(&val, p, sizeof(u32));
696
return GSVector2i(vreinterpret_s32_u32(vset_lane_u32(val, vdup_n_u32(0), 0)));
697
}
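
// Usage note: the memcpy above is the usual way to express a possibly
// unaligned 32-bit load without undefined behaviour; optimising compilers
// typically fold it into the single "ldr s0" load mentioned in the comment.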
698
699
ALWAYS_INLINE static GSVector2i zext32(s32 v) { return GSVector2i(vset_lane_s32(v, vdup_n_s32(0), 0)); }
700
701
template<bool aligned>
702
ALWAYS_INLINE static GSVector2i load(const void* p)
703
{
704
#ifdef CPU_ARCH_ARM32
705
if constexpr (!aligned)
706
return GSVector2i(vreinterpret_s32_s8(vld1_s8((const int8_t*)p)));
707
#endif
708
709
return GSVector2i(vld1_s32((const int32_t*)p));
710
}
711
712
ALWAYS_INLINE static void store32(void* p, const GSVector2i& v)
713
{
714
s32 val = vget_lane_s32(v, 0);
715
std::memcpy(p, &val, sizeof(s32));
716
}
717
718
template<bool aligned>
719
ALWAYS_INLINE static void store(void* p, const GSVector2i& v)
720
{
721
#ifdef CPU_ARCH_ARM32
722
if constexpr (!aligned)
723
{
724
vst1_s8((int8_t*)p, vreinterpret_s8_s32(v.v2s));
725
return;
726
}
727
#endif
728
729
vst1_s32((int32_t*)p, v.v2s);
730
}
731
732
ALWAYS_INLINE void operator&=(const GSVector2i& v)
733
{
734
v2s = vreinterpret_s32_s8(vand_s8(vreinterpret_s8_s32(v2s), vreinterpret_s8_s32(v.v2s)));
735
}
736
737
ALWAYS_INLINE void operator|=(const GSVector2i& v)
738
{
739
v2s = vreinterpret_s32_s8(vorr_s8(vreinterpret_s8_s32(v2s), vreinterpret_s8_s32(v.v2s)));
740
}
741
742
ALWAYS_INLINE void operator^=(const GSVector2i& v)
743
{
744
v2s = vreinterpret_s32_s8(veor_s8(vreinterpret_s8_s32(v2s), vreinterpret_s8_s32(v.v2s)));
745
}
746
747
ALWAYS_INLINE friend GSVector2i operator&(const GSVector2i& v1, const GSVector2i& v2)
748
{
749
return GSVector2i(vreinterpret_s32_s8(vand_s8(vreinterpret_s8_s32(v1.v2s), vreinterpret_s8_s32(v2.v2s))));
750
}
751
752
ALWAYS_INLINE friend GSVector2i operator|(const GSVector2i& v1, const GSVector2i& v2)
753
{
754
return GSVector2i(vreinterpret_s32_s8(vorr_s8(vreinterpret_s8_s32(v1.v2s), vreinterpret_s8_s32(v2.v2s))));
755
}
756
757
ALWAYS_INLINE friend GSVector2i operator^(const GSVector2i& v1, const GSVector2i& v2)
758
{
759
return GSVector2i(vreinterpret_s32_s8(veor_s8(vreinterpret_s8_s32(v1.v2s), vreinterpret_s8_s32(v2.v2s))));
760
}
761
762
ALWAYS_INLINE friend GSVector2i operator&(const GSVector2i& v, int i) { return v & GSVector2i(i); }
763
764
ALWAYS_INLINE friend GSVector2i operator|(const GSVector2i& v, int i) { return v | GSVector2i(i); }
765
766
ALWAYS_INLINE friend GSVector2i operator^(const GSVector2i& v, int i) { return v ^ GSVector2i(i); }
767
768
ALWAYS_INLINE friend GSVector2i operator~(const GSVector2i& v) { return GSVector2i(vmvn_s32(v.v2s)); }
769
770
ALWAYS_INLINE static GSVector2i zero() { return GSVector2i(0); }
771
772
ALWAYS_INLINE GSVector2i xy() const { return *this; }
773
ALWAYS_INLINE GSVector2i xx() const { return GSVector2i(__builtin_shufflevector(v2s, v2s, 0, 0)); }
774
ALWAYS_INLINE GSVector2i yx() const { return GSVector2i(__builtin_shufflevector(v2s, v2s, 1, 0)); }
775
ALWAYS_INLINE GSVector2i yy() const { return GSVector2i(__builtin_shufflevector(v2s, v2s, 1, 1)); }
776
};
777
778
class alignas(16) GSVector2
779
{
780
struct cxpr_init_tag
781
{
782
};
783
static constexpr cxpr_init_tag cxpr_init{};
784
785
constexpr GSVector2(cxpr_init_tag, float x, float y) : F32{x, y} {}
786
787
constexpr GSVector2(cxpr_init_tag, int x, int y) : I32{x, y} {}
788
789
public:
790
union
791
{
792
struct
793
{
794
float x, y;
795
};
796
struct
797
{
798
float r, g;
799
};
800
float F32[2];
801
double F64[1];
802
s8 I8[8];
803
s16 I16[4];
804
s32 I32[2];
805
s64 I64[1];
806
u8 U8[8];
807
u16 U16[4];
808
u32 U32[2];
809
u64 U64[1];
810
float32x2_t v2s;
811
};
812
813
GSVector2() = default;
814
815
constexpr static GSVector2 cxpr(float x, float y) { return GSVector2(cxpr_init, x, y); }
816
817
constexpr static GSVector2 cxpr(float x) { return GSVector2(cxpr_init, x, x); }
818
819
constexpr static GSVector2 cxpr(int x, int y) { return GSVector2(cxpr_init, x, y); }
820
821
constexpr static GSVector2 cxpr(int x) { return GSVector2(cxpr_init, x, x); }
822
823
ALWAYS_INLINE GSVector2(float x, float y) : v2s(vset_lane_f32(y, vdup_n_f32(x), 1)) {}
824
825
ALWAYS_INLINE GSVector2(int x, int y) : v2s(vcvt_f32_s32(vset_lane_s32(y, vdup_n_s32(x), 1))) {}
826
827
ALWAYS_INLINE constexpr explicit GSVector2(float32x2_t m) : v2s(m) {}
828
829
ALWAYS_INLINE explicit GSVector2(float f) { v2s = vdup_n_f32(f); }
830
831
ALWAYS_INLINE explicit GSVector2(int i) { v2s = vcvt_f32_s32(vdup_n_s32(i)); }
832
833
ALWAYS_INLINE explicit GSVector2(const GSVector2i& v);
834
835
ALWAYS_INLINE static GSVector2 cast(const GSVector2i& v);
836
837
ALWAYS_INLINE void operator=(float f) { v2s = vdup_n_f32(f); }
838
839
ALWAYS_INLINE void operator=(float32x2_t m) { v2s = m; }
840
841
ALWAYS_INLINE operator float32x2_t() const { return v2s; }
842
843
ALWAYS_INLINE GSVector2 abs() const { return GSVector2(vabs_f32(v2s)); }
844
ALWAYS_INLINE GSVector2 neg() const { return GSVector2(vneg_f32(v2s)); }
845
846
#ifdef CPU_ARCH_ARM64
847
848
ALWAYS_INLINE GSVector2 floor() const { return GSVector2(vrndm_f32(v2s)); }
849
ALWAYS_INLINE GSVector2 ceil() const { return GSVector2(vrndp_f32(v2s)); }
850
851
#else
852
853
ALWAYS_INLINE GSVector2 floor() const
854
{
855
return GSVector2(std::floor(vget_lane_f32(v2s, 0)), std::floor(vget_lane_f32(v2s, 1)));
856
}
857
858
ALWAYS_INLINE GSVector2 ceil() const
859
{
860
return GSVector2(std::ceil(vget_lane_f32(v2s, 0)), std::ceil(vget_lane_f32(v2s, 1)));
861
}
862
863
#endif
864
865
ALWAYS_INLINE GSVector2 sat(const GSVector2& a, const GSVector2& b) const { return max(a).min(b); }
866
867
ALWAYS_INLINE GSVector2 sat(const float scale = 255) const { return sat(zero(), GSVector2(scale)); }
868
869
ALWAYS_INLINE GSVector2 clamp(const float scale = 255) const { return min(GSVector2(scale)); }
870
871
ALWAYS_INLINE GSVector2 min(const GSVector2& a) const { return GSVector2(vmin_f32(v2s, a.v2s)); }
872
873
ALWAYS_INLINE GSVector2 max(const GSVector2& a) const { return GSVector2(vmax_f32(v2s, a.v2s)); }
874
875
template<int mask>
876
ALWAYS_INLINE GSVector2 blend32(const GSVector2& a) const
877
{
878
return GSVector2(__builtin_shufflevector(v2s, a.v2s, (mask & 1) ? 4 : 0, (mask & 2) ? 5 : 1));
879
}
880
881
ALWAYS_INLINE GSVector2 blend32(const GSVector2& a, const GSVector2& mask) const
882
{
883
// duplicate sign bit across and bit select
884
const uint32x2_t bitmask = vreinterpret_u32_s32(vshr_n_s32(vreinterpret_s32_f32(mask.v2s), 31));
885
return GSVector2(vbsl_f32(bitmask, a.v2s, v2s));
886
}
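
// The runtime blend above keys off the sign bit of each lane of 'mask'
// (vshr_n_s32 by 31 turns it into an all-ones/all-zeros selector), so lanes
// whose mask value is negative come from 'a'. A small hedged usage sketch:
//
//   GSVector2 lo(1.0f, 2.0f), hi(3.0f, 4.0f);
//   GSVector2 r = lo.blend32(hi, lo < hi); // takes 'hi' in lanes where lo < hi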
887
888
ALWAYS_INLINE GSVector2 andnot(const GSVector2& v) const
889
{
890
return GSVector2(vreinterpret_f32_s32(vbic_s32(vreinterpret_s32_f32(v2s), vreinterpret_s32_f32(v.v2s))));
891
}
892
893
ALWAYS_INLINE int mask() const
894
{
895
const uint32x2_t masks = vshr_n_u32(vreinterpret_u32_f32(v2s), 31);
896
return (vget_lane_u32(masks, 0) | (vget_lane_u32(masks, 1) << 1));
897
}
898
899
ALWAYS_INLINE bool alltrue() const
900
{
901
return (vget_lane_u64(vreinterpret_u64_f32(v2s), 0) == UINT64_C(0xFFFFFFFFFFFFFFFF));
902
}
903
904
ALWAYS_INLINE bool allfalse() const { return (vget_lane_u64(vreinterpret_u64_f32(v2s), 0) == UINT64_C(0)); }
905
906
ALWAYS_INLINE GSVector2 replace_nan(const GSVector2& v) const { return v.blend32(*this, *this == *this); }
907
908
template<int src, int dst>
909
ALWAYS_INLINE GSVector2 insert32(const GSVector2& v) const
910
{
911
#ifdef CPU_ARCH_ARM64
912
return GSVector2(vcopy_lane_f32(v2s, dst, v.v2s, src));
913
#else
914
return GSVector2(vset_lane_f32(vget_lane_f32(v.v2s, src), v2s, dst));
915
#endif
916
}
917
918
template<int i>
919
ALWAYS_INLINE int extract32() const
920
{
921
return vget_lane_s32(vreinterpret_s32_f32(v2s), i);
922
}
923
924
ALWAYS_INLINE float dot(const GSVector2& v) const
925
{
926
#ifdef CPU_ARCH_ARM64
927
return vaddv_f32(vmul_f32(v2s, v.v2s));
928
#else
929
const float32x2_t dp = vmul_f32(v2s, v.v2s);
930
return vget_lane_f32(vadd_f32(dp, vdup_lane_f32(dp, 1)), 0);
931
#endif
932
}
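
// e.g. GSVector2(1.0f, 2.0f).dot(GSVector2(3.0f, 4.0f)) == 11.0f on both paths.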
933
934
ALWAYS_INLINE static GSVector2 zero() { return GSVector2(vdup_n_f32(0.0f)); }
935
936
ALWAYS_INLINE static GSVector2 xffffffff() { return GSVector2(vreinterpret_f32_u32(vdup_n_u32(0xFFFFFFFFu))); }
937
938
template<bool aligned>
939
ALWAYS_INLINE static GSVector2 load(const void* p)
940
{
941
#ifdef CPU_ARCH_ARM32
942
if constexpr (!aligned)
943
return GSVector2(vreinterpret_f32_s8(vld1_s8((const int8_t*)p)));
944
#endif
945
946
return GSVector2(vld1_f32(static_cast<const float*>(p)));
947
}
948
949
template<bool aligned>
950
ALWAYS_INLINE static void store(void* p, const GSVector2& v)
951
{
952
#ifdef CPU_ARCH_ARM32
953
if constexpr (!aligned)
954
{
955
vst1_s8(static_cast<int8_t*>(p), vreinterpret_s8_f32(v.v2s));
956
return;
957
}
958
#endif
959
960
vst1_f32(static_cast<float*>(p), v.v2s);
961
}
962
963
ALWAYS_INLINE GSVector2 operator-() const { return neg(); }
964
965
ALWAYS_INLINE void operator+=(const GSVector2& v) { v2s = vadd_f32(v2s, v.v2s); }
966
ALWAYS_INLINE void operator-=(const GSVector2& v) { v2s = vsub_f32(v2s, v.v2s); }
967
ALWAYS_INLINE void operator*=(const GSVector2& v) { v2s = vmul_f32(v2s, v.v2s); }
968
ALWAYS_INLINE void operator/=(const GSVector2& v)
969
{
970
#ifdef CPU_ARCH_ARM64
971
v2s = vdiv_f32(v2s, v.v2s);
972
#else
973
*this = GSVector2(vget_lane_f32(v2s, 0) / vget_lane_f32(v.v2s, 0), vget_lane_f32(v2s, 1) / vget_lane_f32(v.v2s, 1));
974
#endif
975
}
976
977
ALWAYS_INLINE void operator+=(float f) { *this += GSVector2(f); }
978
ALWAYS_INLINE void operator-=(float f) { *this -= GSVector2(f); }
979
ALWAYS_INLINE void operator*=(float f) { *this *= GSVector2(f); }
980
ALWAYS_INLINE void operator/=(float f) { *this /= GSVector2(f); }
981
982
ALWAYS_INLINE void operator&=(const GSVector2& v)
983
{
984
v2s = vreinterpret_f32_u32(vand_u32(vreinterpret_u32_f32(v2s), vreinterpret_u32_f32(v.v2s)));
985
}
986
987
ALWAYS_INLINE void operator|=(const GSVector2& v)
988
{
989
v2s = vreinterpret_f32_u32(vorr_u32(vreinterpret_u32_f32(v2s), vreinterpret_u32_f32(v.v2s)));
990
}
991
992
ALWAYS_INLINE void operator^=(const GSVector2& v)
993
{
994
v2s = vreinterpret_f32_u32(veor_u32(vreinterpret_u32_f32(v2s), vreinterpret_u32_f32(v.v2s)));
995
}
996
997
ALWAYS_INLINE friend GSVector2 operator+(const GSVector2& v1, const GSVector2& v2)
998
{
999
return GSVector2(vadd_f32(v1.v2s, v2.v2s));
1000
}
1001
1002
ALWAYS_INLINE friend GSVector2 operator-(const GSVector2& v1, const GSVector2& v2)
1003
{
1004
return GSVector2(vsub_f32(v1.v2s, v2.v2s));
1005
}
1006
1007
ALWAYS_INLINE friend GSVector2 operator*(const GSVector2& v1, const GSVector2& v2)
1008
{
1009
return GSVector2(vmul_f32(v1.v2s, v2.v2s));
1010
}
1011
1012
ALWAYS_INLINE friend GSVector2 operator/(const GSVector2& v1, const GSVector2& v2)
1013
{
1014
#ifdef CPU_ARCH_ARM64
1015
return GSVector2(vdiv_f32(v1.v2s, v2.v2s));
1016
#else
1017
return GSVector2(vget_lane_f32(v1.v2s, 0) / vget_lane_f32(v2.v2s, 0),
1018
vget_lane_f32(v1.v2s, 1) / vget_lane_f32(v2.v2s, 1));
1019
#endif
1020
}
1021
1022
ALWAYS_INLINE friend GSVector2 operator+(const GSVector2& v, float f) { return v + GSVector2(f); }
1023
ALWAYS_INLINE friend GSVector2 operator-(const GSVector2& v, float f) { return v - GSVector2(f); }
1024
ALWAYS_INLINE friend GSVector2 operator*(const GSVector2& v, float f) { return v * GSVector2(f); }
1025
ALWAYS_INLINE friend GSVector2 operator/(const GSVector2& v, float f) { return v / GSVector2(f); }
1026
1027
ALWAYS_INLINE friend GSVector2 operator&(const GSVector2& v1, const GSVector2& v2)
1028
{
1029
return GSVector2(vreinterpret_f32_u32(vand_u32(vreinterpret_u32_f32(v1.v2s), vreinterpret_u32_f32(v2.v2s))));
1030
}
1031
1032
ALWAYS_INLINE friend GSVector2 operator|(const GSVector2& v1, const GSVector2& v2)
1033
{
1034
return GSVector2(vreinterpret_f32_u32(vorr_u32(vreinterpret_u32_f32(v1.v2s), vreinterpret_u32_f32(v2.v2s))));
1035
}
1036
1037
ALWAYS_INLINE friend GSVector2 operator^(const GSVector2& v1, const GSVector2& v2)
1038
{
1039
return GSVector2(vreinterpret_f32_u32(veor_u32(vreinterpret_u32_f32(v1.v2s), vreinterpret_u32_f32(v2.v2s))));
1040
}
1041
1042
ALWAYS_INLINE friend GSVector2 operator==(const GSVector2& v1, const GSVector2& v2)
1043
{
1044
return GSVector2(vreinterpret_f32_u32(vceq_f32(v1.v2s, v2.v2s)));
1045
}
1046
1047
ALWAYS_INLINE friend GSVector2 operator!=(const GSVector2& v1, const GSVector2& v2)
1048
{
1049
// NEON has no !=
1050
return GSVector2(vreinterpret_f32_u32(vmvn_u32(vceq_f32(v1.v2s, v2.v2s))));
1051
}
1052
1053
ALWAYS_INLINE friend GSVector2 operator>(const GSVector2& v1, const GSVector2& v2)
1054
{
1055
return GSVector2(vreinterpret_f32_u32(vcgt_f32(v1.v2s, v2.v2s)));
1056
}
1057
1058
ALWAYS_INLINE friend GSVector2 operator<(const GSVector2& v1, const GSVector2& v2)
1059
{
1060
return GSVector2(vreinterpret_f32_u32(vclt_f32(v1.v2s, v2.v2s)));
1061
}
1062
1063
ALWAYS_INLINE friend GSVector2 operator>=(const GSVector2& v1, const GSVector2& v2)
1064
{
1065
return GSVector2(vreinterpret_f32_u32(vcge_f32(v1.v2s, v2.v2s)));
1066
}
1067
1068
ALWAYS_INLINE friend GSVector2 operator<=(const GSVector2& v1, const GSVector2& v2)
1069
{
1070
return GSVector2(vreinterpret_f32_u32(vcle_f32(v1.v2s, v2.v2s)));
1071
}
1072
1073
ALWAYS_INLINE GSVector2 xy() const { return *this; }
1074
ALWAYS_INLINE GSVector2 xx() const { return GSVector2(__builtin_shufflevector(v2s, v2s, 0, 0)); }
1075
ALWAYS_INLINE GSVector2 yx() const { return GSVector2(__builtin_shufflevector(v2s, v2s, 1, 0)); }
1076
ALWAYS_INLINE GSVector2 yy() const { return GSVector2(__builtin_shufflevector(v2s, v2s, 1, 1)); }
1077
};
1078
1079
class alignas(16) GSVector4i
1080
{
1081
struct cxpr_init_tag
1082
{
1083
};
1084
static constexpr cxpr_init_tag cxpr_init{};
1085
1086
constexpr GSVector4i(cxpr_init_tag, s32 x, s32 y, s32 z, s32 w) : S32{x, y, z, w} {}
1087
1088
constexpr GSVector4i(cxpr_init_tag, s16 s0, s16 s1, s16 s2, s16 s3, s16 s4, s16 s5, s16 s6, s16 s7)
1089
: S16{s0, s1, s2, s3, s4, s5, s6, s7}
1090
{
1091
}
1092
1093
constexpr GSVector4i(cxpr_init_tag, s8 b0, s8 b1, s8 b2, s8 b3, s8 b4, s8 b5, s8 b6, s8 b7, s8 b8, s8 b9, s8 b10,
1094
s8 b11, s8 b12, s8 b13, s8 b14, s8 b15)
1095
: S8{b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15}
1096
{
1097
}
1098
1099
public:
1100
union
1101
{
1102
struct
1103
{
1104
int x, y, z, w;
1105
};
1106
struct
1107
{
1108
int r, g, b, a;
1109
};
1110
struct
1111
{
1112
int left, top, right, bottom;
1113
};
1114
float F32[4];
1115
s8 S8[16];
1116
s16 S16[8];
1117
s32 S32[4];
1118
s64 S64[2];
1119
u8 U8[16];
1120
u16 U16[8];
1121
u32 U32[4];
1122
u64 U64[2];
1123
int32x4_t v4s;
1124
};
1125
1126
GSVector4i() = default;
1127
1128
ALWAYS_INLINE constexpr static GSVector4i cxpr(s32 x, s32 y, s32 z, s32 w)
1129
{
1130
return GSVector4i(cxpr_init, x, y, z, w);
1131
}
1132
1133
ALWAYS_INLINE constexpr static GSVector4i cxpr(s32 x) { return GSVector4i(cxpr_init, x, x, x, x); }
1134
1135
ALWAYS_INLINE constexpr static GSVector4i cxpr16(s16 x) { return GSVector4i(cxpr_init, x, x, x, x, x, x, x, x); }
1136
1137
ALWAYS_INLINE constexpr static GSVector4i cxpr16(s16 s0, s16 s1, s16 s2, s16 s3, s16 s4, s16 s5, s16 s6, s16 s7)
1138
{
1139
return GSVector4i(cxpr_init, s0, s1, s2, s3, s4, s5, s6, s7);
1140
}
1141
1142
ALWAYS_INLINE constexpr static GSVector4i cxpr8(s8 b0, s8 b1, s8 b2, s8 b3, s8 b4, s8 b5, s8 b6, s8 b7, s8 b8, s8 b9,
1143
s8 b10, s8 b11, s8 b12, s8 b13, s8 b14, s8 b15)
1144
{
1145
return GSVector4i(cxpr_init, b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15);
1146
}
1147
1148
ALWAYS_INLINE GSVector4i(s32 x, s32 y, s32 z, s32 w)
1149
: v4s(vsetq_lane_s32(w, vsetq_lane_s32(z, vsetq_lane_s32(y, vdupq_n_s32(x), 1), 2), 3))
1150
{
1151
}
1152
1153
ALWAYS_INLINE GSVector4i(s16 s0, s16 s1, s16 s2, s16 s3, s16 s4, s16 s5, s16 s6, s16 s7)
1154
: S16{s0, s1, s2, s3, s4, s5, s6, s7}
1155
{
1156
}
1157
1158
constexpr GSVector4i(s8 b0, s8 b1, s8 b2, s8 b3, s8 b4, s8 b5, s8 b6, s8 b7, s8 b8, s8 b9, s8 b10, s8 b11, s8 b12,
1159
s8 b13, s8 b14, s8 b15)
1160
: S8{b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15}
1161
{
1162
}
1163
1164
ALWAYS_INLINE explicit GSVector4i(const GSVector2i& v) : v4s(vcombine_s32(v.v2s, vcreate_s32(0))) {}
1165
1166
ALWAYS_INLINE explicit GSVector4i(s32 i) { *this = i; }
1167
1168
ALWAYS_INLINE constexpr explicit GSVector4i(int32x4_t m) : v4s(m) {}
1169
1170
ALWAYS_INLINE explicit GSVector4i(const GSVector2& v) : v4s(vcombine_s32(vcvt_s32_f32(v.v2s), vcreate_s32(0))) {}
1171
ALWAYS_INLINE explicit GSVector4i(const GSVector4& v);
1172
1173
ALWAYS_INLINE static GSVector4i cast(const GSVector4& v);
1174
1175
ALWAYS_INLINE void operator=(s32 i) { v4s = vdupq_n_s32(i); }
1176
1177
ALWAYS_INLINE operator int32x4_t() const { return v4s; }
1178
1179
// rect
1180
1181
ALWAYS_INLINE s32 width() const { return right - left; }
1182
ALWAYS_INLINE s32 height() const { return bottom - top; }
1183
1184
ALWAYS_INLINE GSVector2i rsize() const { return zwzw().sub32(xyxy()).xy(); }
1185
1186
ALWAYS_INLINE bool rempty() const
1187
{
1188
// !any((x, y) < (z, w)) i.e. !not_empty
1189
return (vget_lane_u64(vreinterpret_u64_u32(vclt_s32(vget_low_s32(v4s), vget_high_s32(v4s))), 0) !=
1190
0xFFFFFFFFFFFFFFFFULL);
1191
}
1192
1193
ALWAYS_INLINE bool rvalid() const
1194
{
1195
// !all((x, y) >= (z, w))
1196
return (vget_lane_u64(vreinterpret_u64_u32(vcge_s32(vget_low_s32(v4s), vget_high_s32(v4s))), 0) == 0);
1197
}
1198
1199
ALWAYS_INLINE GSVector4i runion(const GSVector4i& a) const { return min_s32(a).upl64(max_s32(a).srl<8>()); }
1200
1201
ALWAYS_INLINE GSVector4i rintersect(const GSVector4i& a) const { return sat_s32(a); }
1202
ALWAYS_INLINE bool rintersects(const GSVector4i& v) const { return rintersect(v).rvalid(); }
1203
ALWAYS_INLINE bool rcontains(const GSVector4i& v) const { return rintersect(v).eq(v); }
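
// Rect convention: when a GSVector4i holds a rectangle, (x, y, z, w) are
// (left, top, right, bottom). E.g. for GSVector4i r(10, 20, 30, 40),
// r.width() == 20, r.height() == 20, and r.rcontains(GSVector4i(12, 22, 28, 38))
// is true because the intersection equals the inner rect.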
1204
1205
ALWAYS_INLINE u32 rgba32() const { return static_cast<u32>(ps32().pu16().extract32<0>()); }
1206
1207
ALWAYS_INLINE GSVector4i sat_s8(const GSVector4i& min, const GSVector4i& max) const
1208
{
1209
return max_s8(min).min_s8(max);
1210
}
1211
ALWAYS_INLINE GSVector4i sat_s8(const GSVector4i& minmax) const
1212
{
1213
return max_s8(minmax.xyxy()).min_s8(minmax.zwzw());
1214
}
1215
ALWAYS_INLINE GSVector4i sat_s16(const GSVector4i& min, const GSVector4i& max) const
1216
{
1217
return max_s16(min).min_s16(max);
1218
}
1219
ALWAYS_INLINE GSVector4i sat_s16(const GSVector4i& minmax) const
1220
{
1221
return max_s16(minmax.xyxy()).min_s16(minmax.zwzw());
1222
}
1223
ALWAYS_INLINE GSVector4i sat_s32(const GSVector4i& min, const GSVector4i& max) const
1224
{
1225
return max_s32(min).min_s32(max);
1226
}
1227
ALWAYS_INLINE GSVector4i sat_s32(const GSVector4i& minmax) const
1228
{
1229
return max_s32(minmax.xyxy()).min_s32(minmax.zwzw());
1230
}
1231
1232
ALWAYS_INLINE GSVector4i sat_u8(const GSVector4i& min, const GSVector4i& max) const
1233
{
1234
return max_u8(min).min_u8(max);
1235
}
1236
ALWAYS_INLINE GSVector4i sat_u8(const GSVector4i& minmax) const
1237
{
1238
return max_u8(minmax.xyxy()).min_u8(minmax.zwzw());
1239
}
1240
ALWAYS_INLINE GSVector4i sat_u16(const GSVector4i& min, const GSVector4i& max) const
1241
{
1242
return max_u16(min).min_u16(max);
1243
}
1244
ALWAYS_INLINE GSVector4i sat_u16(const GSVector4i& minmax) const
1245
{
1246
return max_u16(minmax.xyxy()).min_u16(minmax.zwzw());
1247
}
1248
ALWAYS_INLINE GSVector4i sat_u32(const GSVector4i& min, const GSVector4i& max) const
1249
{
1250
return max_u32(min).min_u32(max);
1251
}
1252
ALWAYS_INLINE GSVector4i sat_u32(const GSVector4i& minmax) const
1253
{
1254
return max_u32(minmax.xyxy()).min_u32(minmax.zwzw());
1255
}
1256
1257
ALWAYS_INLINE GSVector4i min_s8(const GSVector4i& v) const
1258
{
1259
return GSVector4i(vreinterpretq_s32_s8(vminq_s8(vreinterpretq_s8_s32(v4s), vreinterpretq_s8_s32(v.v4s))));
1260
}
1261
1262
ALWAYS_INLINE GSVector4i max_s8(const GSVector4i& v) const
1263
{
1264
return GSVector4i(vreinterpretq_s32_s8(vmaxq_s8(vreinterpretq_s8_s32(v4s), vreinterpretq_s8_s32(v.v4s))));
1265
}
1266
1267
ALWAYS_INLINE GSVector4i min_s16(const GSVector4i& v) const
1268
{
1269
return GSVector4i(vreinterpretq_s32_s16(vminq_s16(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v.v4s))));
1270
}
1271
1272
ALWAYS_INLINE GSVector4i max_s16(const GSVector4i& v) const
1273
{
1274
return GSVector4i(vreinterpretq_s32_s16(vmaxq_s16(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v.v4s))));
1275
}
1276
1277
ALWAYS_INLINE GSVector4i min_s32(const GSVector4i& v) const { return GSVector4i(vminq_s32(v4s, v.v4s)); }
1278
1279
ALWAYS_INLINE GSVector4i max_s32(const GSVector4i& v) const { return GSVector4i(vmaxq_s32(v4s, v.v4s)); }
1280
1281
ALWAYS_INLINE GSVector4i min_u8(const GSVector4i& v) const
1282
{
1283
return GSVector4i(vreinterpretq_s32_u8(vminq_u8(vreinterpretq_u8_s32(v4s), vreinterpretq_u8_s32(v.v4s))));
1284
}
1285
1286
ALWAYS_INLINE GSVector4i max_u8(const GSVector4i& v) const
1287
{
1288
return GSVector4i(vreinterpretq_s32_u8(vmaxq_u8(vreinterpretq_u8_s32(v4s), vreinterpretq_u8_s32(v.v4s))));
1289
}
1290
1291
ALWAYS_INLINE GSVector4i min_u16(const GSVector4i& v) const
1292
{
1293
return GSVector4i(vreinterpretq_s32_u16(vminq_u16(vreinterpretq_u16_s32(v4s), vreinterpretq_u16_s32(v.v4s))));
1294
}
1295
1296
ALWAYS_INLINE GSVector4i max_u16(const GSVector4i& v) const
1297
{
1298
return GSVector4i(vreinterpretq_s32_u16(vmaxq_u16(vreinterpretq_u16_s32(v4s), vreinterpretq_u16_s32(v.v4s))));
1299
}
1300
1301
ALWAYS_INLINE GSVector4i min_u32(const GSVector4i& v) const
1302
{
1303
return GSVector4i(vreinterpretq_s32_u32(vminq_u32(vreinterpretq_u32_s32(v4s), vreinterpretq_u32_s32(v.v4s))));
1304
}
1305
1306
ALWAYS_INLINE GSVector4i max_u32(const GSVector4i& v) const
1307
{
1308
return GSVector4i(vreinterpretq_s32_u32(vmaxq_u32(vreinterpretq_u32_s32(v4s), vreinterpretq_u32_s32(v.v4s))));
1309
}
1310
1311
ALWAYS_INLINE GSVector4i madd_s16(const GSVector4i& v) const
1312
{
1313
#ifdef CPU_ARCH_ARM64
1314
// pair adjacent 16-bit products into 32-bit lanes, matching the sse2neon
// fallback below (and SSE2 pmaddwd)
const int32x4_t low =
vmull_s16(vget_low_s16(vreinterpretq_s16_s32(v4s)), vget_low_s16(vreinterpretq_s16_s32(v.v4s)));
const int32x4_t high = vmull_high_s16(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v.v4s));
return GSVector4i(vpaddq_s32(low, high));
1317
#else
1318
// borrowed from sse2neon
1319
const int32x4_t low =
1320
vmlal_s16(vdupq_n_s32(0), vget_low_s16(vreinterpretq_s16_s32(v4s)), vget_low_s16(vreinterpretq_s16_s32(v.v4s)));
1321
const int32x4_t high =
1322
vmlal_s16(vdupq_n_s32(0), vget_high_s16(vreinterpretq_s16_s32(v4s)), vget_high_s16(vreinterpretq_s16_s32(v.v4s)));
1323
return GSVector4i(vcombine_s32(vpadd_s32(vget_low_s32(low), vget_high_s32(low)),
1324
vpadd_s32(vget_low_s32(high), vget_high_s32(high))));
1325
#endif
1326
}
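
// madd_s16 mirrors SSE2's pmaddwd: multiply adjacent signed 16-bit elements
// and sum each pair into a 32-bit lane. E.g. {1, 2, 3, 4, ...} madd'ed with
// {10, 20, 30, 40, ...} gives 1*10 + 2*20 == 50 in lane 0 and 3*30 + 4*40 == 250
// in lane 1.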
1327
1328
ALWAYS_INLINE GSVector4i addp_s32() const
1329
{
1330
#ifdef CPU_ARCH_ARM64
1331
return GSVector4i(vpaddq_s32(v4s, v4s));
1332
#else
1333
const int32x2_t res = vpadd_s32(vget_low_s32(v4s), vget_high_s32(v4s));
1334
return GSVector4i(vcombine_s32(res, res));
1335
#endif
1336
}
1337
1338
ALWAYS_INLINE s32 addv_s32() const
1339
{
1340
#ifdef CPU_ARCH_ARM64
1341
return vaddvq_s32(v4s);
1342
#else
1343
const int32x2_t res = vpadd_s32(vget_low_s32(v4s), vget_high_s32(v4s));
1344
return vget_lane_s32(res, 0) + vget_lane_s32(res, 1);
1345
#endif
1346
}
1347
1348
#ifdef CPU_ARCH_ARM64
1349
1350
ALWAYS_INLINE u8 minv_u8() const { return vminvq_u8(vreinterpretq_u8_s32(v4s)); }
1351
1352
ALWAYS_INLINE u16 maxv_u8() const { return vmaxvq_u8(vreinterpretq_u8_s32(v4s)); }
1353
1354
ALWAYS_INLINE u16 minv_u16() const { return vminvq_u16(vreinterpretq_u16_s32(v4s)); }
1355
1356
ALWAYS_INLINE u16 maxv_u16() const { return vmaxvq_u16(vreinterpretq_u16_s32(v4s)); }
1357
1358
ALWAYS_INLINE s32 minv_s32() const { return vminvq_s32(v4s); }
1359
1360
ALWAYS_INLINE u32 minv_u32() const { return vminvq_u32(vreinterpretq_u32_s32(v4s)); }
1361
1362
ALWAYS_INLINE s32 maxv_s32() const { return vmaxvq_s32(v4s); }
1363
1364
ALWAYS_INLINE u32 maxv_u32() const { return vmaxvq_u32(vreinterpretq_u32_s32(v4s)); }
1365
1366
#else
1367
1368
ALWAYS_INLINE u8 minv_u8() const
1369
{
1370
uint8x8_t vmin = vmin_u8(vget_low_u8(vreinterpretq_u8_s32(v4s)), vget_high_u8(vreinterpretq_u8_s32(v4s)));
1371
vmin = vmin_u8(vmin, vreinterpret_u8_s32(vdup_lane_s32(vreinterpret_s32_u8(vmin), 1)));
1372
return static_cast<u8>(
1373
std::min(static_cast<u32>(vget_lane_u8(vmin, 0)),
1374
std::min(static_cast<u32>(vget_lane_u8(vmin, 1)),
1375
std::min(static_cast<u32>(vget_lane_u8(vmin, 2)), static_cast<u32>(vget_lane_u8(vmin, 3))))));
1376
}
1377
1378
ALWAYS_INLINE u16 maxv_u8() const
1379
{
1380
uint8x8_t vmax = vmax_u8(vget_low_u8(vreinterpretq_u8_s32(v4s)), vget_high_u8(vreinterpretq_u8_s32(v4s)));
1381
vmax = vmax_u8(vmax, vreinterpret_u8_s32(vdup_lane_s32(vreinterpret_s32_u8(vmax), 1)));
1382
return static_cast<u8>(
1383
std::max(static_cast<u32>(vget_lane_u8(vmax, 0)),
1384
std::max(static_cast<u32>(vget_lane_u8(vmax, 1)),
1385
std::max(static_cast<u32>(vget_lane_u8(vmax, 2)), static_cast<u32>(vget_lane_u8(vmax, 3))))));
1386
}
1387
1388
ALWAYS_INLINE u16 minv_u16() const
1389
{
1390
uint16x4_t vmin = vmin_u16(vget_low_u16(vreinterpretq_u16_s32(v4s)), vget_high_u16(vreinterpretq_u16_s32(v4s)));
1391
vmin = vmin_u16(vmin, vreinterpret_u16_s32(vdup_lane_s32(vreinterpret_s32_u16(vmin), 1)));
1392
return static_cast<u16>(
1393
std::min(static_cast<u32>(vget_lane_u16(vmin, 0)), static_cast<u32>(vget_lane_u16(vmin, 1))));
1394
}
1395
1396
ALWAYS_INLINE u16 maxv_u16() const
1397
{
1398
uint16x4_t vmax = vmax_u16(vget_low_u16(vreinterpretq_u16_s32(v4s)), vget_high_u16(vreinterpretq_u16_s32(v4s)));
1399
vmax = vmax_u16(vmax, vreinterpret_u16_s32(vdup_lane_s32(vreinterpret_s32_u16(vmax), 1)));
1400
return static_cast<u16>(
1401
std::max<u32>(static_cast<u32>(vget_lane_u16(vmax, 0)), static_cast<u32>(vget_lane_u16(vmax, 1))));
1402
}
1403
1404
ALWAYS_INLINE s32 minv_s32() const
1405
{
1406
int32x2_t vmin = vmin_s32(vget_low_s32(v4s), vget_high_s32(v4s));
1407
return std::min<s32>(vget_lane_s32(vmin, 0), vget_lane_s32(vmin, 1));
1408
}
1409
1410
ALWAYS_INLINE u32 minv_u32() const
1411
{
1412
uint32x2_t vmin = vmin_u32(vget_low_u32(vreinterpretq_u32_s32(v4s)), vget_high_u32(vreinterpretq_u32_s32(v4s)));
1413
return std::min<u32>(vget_lane_u32(vmin, 0), vget_lane_u32(vmin, 1));
1414
}
1415
1416
ALWAYS_INLINE s32 maxv_s32() const
1417
{
1418
int32x2_t vmax = vmax_s32(vget_low_s32(v4s), vget_high_s32(v4s));
1419
return std::max<s32>(vget_lane_s32(vmax, 0), vget_lane_s32(vmax, 1));
1420
}
1421
1422
ALWAYS_INLINE u32 maxv_u32() const
1423
{
1424
uint32x2_t vmax = vmax_u32(vget_low_u32(vreinterpretq_u32_s32(v4s)), vget_high_u32(vreinterpretq_u32_s32(v4s)));
1425
return std::max<u32>(vget_lane_u32(vmax, 0), vget_lane_u32(vmax, 1));
1426
}
1427
1428
#endif
1429
1430
ALWAYS_INLINE GSVector4i clamp8() const { return pu16().upl8(); }
1431
1432
ALWAYS_INLINE GSVector4i blend8(const GSVector4i& a, const GSVector4i& mask) const
1433
{
1434
uint8x16_t mask2 = vreinterpretq_u8_s8(vshrq_n_s8(vreinterpretq_s8_s32(mask.v4s), 7));
1435
return GSVector4i(vreinterpretq_s32_u8(vbslq_u8(mask2, vreinterpretq_u8_s32(a.v4s), vreinterpretq_u8_s32(v4s))));
1436
}
1437
1438
template<int mask>
1439
ALWAYS_INLINE GSVector4i blend16(const GSVector4i& a) const
1440
{
1441
return GSVector4i(vreinterpretq_s32_s16(__builtin_shufflevector(
1442
vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(a.v4s), ((mask & 0x01) == 0) ? 0 : 8,
1443
((mask & 0x02) == 0) ? 1 : 9, ((mask & 0x04) == 0) ? 2 : 10, ((mask & 0x08) == 0) ? 3 : 11,
1444
((mask & 0x10) == 0) ? 4 : 12, ((mask & 0x20) == 0) ? 5 : 13, ((mask & 0x40) == 0) ? 6 : 14,
1445
((mask & 0x80) == 0) ? 7 : 15)));
1446
}
1447
1448
template<int mask>
1449
ALWAYS_INLINE GSVector4i blend32(const GSVector4i& v) const
1450
{
1451
return GSVector4i(__builtin_shufflevector(v4s, v.v4s, ((mask & 1) == 0) ? 0 : 4, ((mask & 2) == 0) ? 1 : 5,
1452
((mask & 4) == 0) ? 2 : 6, ((mask & 8) == 0) ? 3 : 7));
1453
}
1454
1455
ALWAYS_INLINE GSVector4i blend(const GSVector4i& v, const GSVector4i& mask) const
1456
{
1457
return GSVector4i(
1458
vreinterpretq_s32_s8(vorrq_s8(vbicq_s8(vreinterpretq_s8_s32(v4s), vreinterpretq_s8_s32(mask.v4s)),
1459
vandq_s8(vreinterpretq_s8_s32(mask.v4s), vreinterpretq_s8_s32(v.v4s)))));
1460
}
1461
1462
ALWAYS_INLINE GSVector4i shuffle8(const GSVector4i& mask) const
1463
{
1464
#ifdef CPU_ARCH_ARM64
1465
return GSVector4i(vreinterpretq_s32_s8(vqtbl1q_s8(vreinterpretq_s8_s32(v4s), vreinterpretq_u8_s32(mask.v4s))));
1466
#else
1467
int8x8x2_t split = {vget_low_s8(vreinterpretq_s8_s32(v4s)), vget_high_s8(vreinterpretq_s8_s32(v4s))};
1468
return GSVector4i(vreinterpretq_s32_s8(vcombine_s8(vtbl2_s8(split, vget_low_s8(vreinterpretq_s8_s32(mask.v4s))),
1469
vtbl2_s8(split, vget_high_s8(vreinterpretq_s8_s32(mask.v4s))))));
1470
#endif
1471
}
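
// shuffle8 is a per-byte table lookup (the NEON analogue of SSSE3 pshufb):
// each byte of 'mask' selects a source byte by index, so e.g. an all-zero mask
// broadcasts byte 0 of *this. Note vqtbl1q_s8/vtbl2_s8 return 0 for any index
// >= 16, while pshufb zeroes only when the mask byte's high bit is set, so the
// two behaviours agree for indices 0-15 or masks with the high bit set.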
1472
1473
ALWAYS_INLINE GSVector4i ps16(const GSVector4i& v) const
1474
{
1475
return GSVector4i(vreinterpretq_s32_s8(
1476
vcombine_s8(vqmovn_s16(vreinterpretq_s16_s32(v4s)), vqmovn_s16(vreinterpretq_s16_s32(v.v4s)))));
1477
}
1478
1479
ALWAYS_INLINE GSVector4i ps16() const
1480
{
1481
return GSVector4i(vreinterpretq_s32_s8(
1482
vcombine_s8(vqmovn_s16(vreinterpretq_s16_s32(v4s)), vqmovn_s16(vreinterpretq_s16_s32(v4s)))));
1483
}
1484
1485
ALWAYS_INLINE GSVector4i pu16(const GSVector4i& v) const
1486
{
1487
return GSVector4i(vreinterpretq_s32_u8(
1488
vcombine_u8(vqmovun_s16(vreinterpretq_s16_s32(v4s)), vqmovun_s16(vreinterpretq_s16_s32(v.v4s)))));
1489
}
1490
1491
ALWAYS_INLINE GSVector4i pu16() const
1492
{
1493
return GSVector4i(vreinterpretq_s32_u8(
1494
vcombine_u8(vqmovun_s16(vreinterpretq_s16_s32(v4s)), vqmovun_s16(vreinterpretq_s16_s32(v4s)))));
1495
}
1496
1497
ALWAYS_INLINE GSVector4i ps32(const GSVector4i& v) const
1498
{
1499
return GSVector4i(vreinterpretq_s32_s16(vcombine_s16(vqmovn_s32(v4s), vqmovn_s32(v.v4s))));
1500
}
1501
1502
ALWAYS_INLINE GSVector4i ps32() const
1503
{
1504
return GSVector4i(vreinterpretq_s32_s16(vcombine_s16(vqmovn_s32(v4s), vqmovn_s32(v4s))));
1505
}
1506
1507
ALWAYS_INLINE GSVector4i pu32(const GSVector4i& v) const
1508
{
1509
return GSVector4i(vreinterpretq_s32_u16(vcombine_u16(vqmovun_s32(v4s), vqmovun_s32(v.v4s))));
1510
}
1511
1512
ALWAYS_INLINE GSVector4i pu32() const
1513
{
1514
return GSVector4i(vreinterpretq_s32_u16(vcombine_u16(vqmovun_s32(v4s), vqmovun_s32(v4s))));
1515
}
1516
1517
#ifdef CPU_ARCH_ARM64
1518
1519
ALWAYS_INLINE GSVector4i upl8(const GSVector4i& v) const
1520
{
1521
return GSVector4i(vreinterpretq_s32_s8(vzip1q_s8(vreinterpretq_s8_s32(v4s), vreinterpretq_s8_s32(v.v4s))));
1522
}
1523
1524
ALWAYS_INLINE GSVector4i uph8(const GSVector4i& v) const
1525
{
1526
return GSVector4i(vreinterpretq_s32_s8(vzip2q_s8(vreinterpretq_s8_s32(v4s), vreinterpretq_s8_s32(v.v4s))));
1527
}
1528
1529
ALWAYS_INLINE GSVector4i upl16(const GSVector4i& v) const
1530
{
1531
return GSVector4i(vreinterpretq_s32_s16(vzip1q_s16(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v.v4s))));
1532
}
1533
1534
ALWAYS_INLINE GSVector4i uph16(const GSVector4i& v) const
1535
{
1536
return GSVector4i(vreinterpretq_s32_s16(vzip2q_s16(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v.v4s))));
1537
}
1538
1539
ALWAYS_INLINE GSVector4i upl32(const GSVector4i& v) const { return GSVector4i(vzip1q_s32(v4s, v.v4s)); }
1540
1541
ALWAYS_INLINE GSVector4i uph32(const GSVector4i& v) const { return GSVector4i(vzip2q_s32(v4s, v.v4s)); }
1542
1543
ALWAYS_INLINE GSVector4i upl64(const GSVector4i& v) const
1544
{
1545
return GSVector4i(vreinterpretq_s32_s64(
1546
vcombine_s64(vget_low_s64(vreinterpretq_s64_s32(v4s)), vget_low_s64(vreinterpretq_s64_s32(v.v4s)))));
1547
}
1548
1549
ALWAYS_INLINE GSVector4i uph64(const GSVector4i& v) const
1550
{
1551
return GSVector4i(vreinterpretq_s32_s64(
1552
vcombine_s64(vget_high_s64(vreinterpretq_s64_s32(v4s)), vget_high_s64(vreinterpretq_s64_s32(v.v4s)))));
1553
}
1554
1555
ALWAYS_INLINE GSVector4i upl8() const
1556
{
1557
return GSVector4i(vreinterpretq_s32_s8(vzip1q_s8(vreinterpretq_s8_s32(v4s), vdupq_n_s8(0))));
1558
}
1559
1560
ALWAYS_INLINE GSVector4i uph8() const
1561
{
1562
return GSVector4i(vreinterpretq_s32_s8(vzip2q_s8(vreinterpretq_s8_s32(v4s), vdupq_n_s8(0))));
1563
}
1564
1565
ALWAYS_INLINE GSVector4i upl16() const
1566
{
1567
return GSVector4i(vreinterpretq_s32_s16(vzip1q_s16(vreinterpretq_s16_s32(v4s), vdupq_n_s16(0))));
1568
}
1569
1570
ALWAYS_INLINE GSVector4i uph16() const
1571
{
1572
return GSVector4i(vreinterpretq_s32_s16(vzip2q_s16(vreinterpretq_s16_s32(v4s), vdupq_n_s16(0))));
1573
}
1574
1575
ALWAYS_INLINE GSVector4i upl32() const { return GSVector4i(vzip1q_s32(v4s, vdupq_n_s32(0))); }
1576
1577
ALWAYS_INLINE GSVector4i uph32() const { return GSVector4i(vzip2q_s32(v4s, vdupq_n_s32(0))); }
1578
1579
ALWAYS_INLINE GSVector4i upl64() const
1580
{
1581
return GSVector4i(vreinterpretq_s32_s64(vcombine_s64(vget_low_s64(vreinterpretq_s64_s32(v4s)), vdup_n_s64(0))));
1582
}
1583
1584
ALWAYS_INLINE GSVector4i uph64() const
1585
{
1586
return GSVector4i(vreinterpretq_s32_s64(vcombine_s64(vget_high_s64(vreinterpretq_s64_s32(v4s)), vdup_n_s64(0))));
1587
}
1588
1589
#else
1590
1591
ALWAYS_INLINE GSVector4i upl8(const GSVector4i& v) const
1592
{
1593
const int8x8x2_t res = vzip_s8(vget_low_s8(vreinterpretq_s8_s32(v4s)), vget_low_s8(vreinterpretq_s8_s32(v.v4s)));
1594
return GSVector4i(vreinterpretq_s32_s8(vcombine_s8(res.val[0], res.val[1])));
1595
}
1596
1597
ALWAYS_INLINE GSVector4i uph8(const GSVector4i& v) const
1598
{
1599
const int8x8x2_t res = vzip_s8(vget_high_s8(vreinterpretq_s8_s32(v4s)), vget_high_s8(vreinterpretq_s8_s32(v.v4s)));
1600
return GSVector4i(vreinterpretq_s32_s8(vcombine_s8(res.val[0], res.val[1])));
1601
}
1602
1603
ALWAYS_INLINE GSVector4i upl16(const GSVector4i& v) const
1604
{
1605
const int16x4x2_t res =
1606
vzip_s16(vget_low_s16(vreinterpretq_s16_s32(v4s)), vget_low_s16(vreinterpretq_s16_s32(v.v4s)));
1607
return GSVector4i(vreinterpretq_s32_s16(vcombine_s16(res.val[0], res.val[1])));
1608
}
1609
1610
ALWAYS_INLINE GSVector4i uph16(const GSVector4i& v) const
1611
{
1612
const int16x4x2_t res =
1613
vzip_s16(vget_high_s16(vreinterpretq_s16_s32(v4s)), vget_high_s16(vreinterpretq_s16_s32(v.v4s)));
1614
return GSVector4i(vreinterpretq_s32_s16(vcombine_s16(res.val[0], res.val[1])));
1615
}
1616
1617
ALWAYS_INLINE GSVector4i upl32(const GSVector4i& v) const
1618
{
1619
const int32x2x2_t res = vzip_s32(vget_low_s32(v4s), vget_low_s32(v.v4s));
1620
return GSVector4i(vcombine_s32(res.val[0], res.val[1]));
1621
}
1622
1623
ALWAYS_INLINE GSVector4i uph32(const GSVector4i& v) const
1624
{
1625
const int32x2x2_t res = vzip_s32(vget_high_s32(v4s), vget_high_s32(v.v4s));
1626
return GSVector4i(vcombine_s32(res.val[0], res.val[1]));
1627
}
1628
1629
ALWAYS_INLINE GSVector4i upl64(const GSVector4i& v) const
1630
{
1631
return GSVector4i(vreinterpretq_s32_s64(
1632
vcombine_s64(vget_low_s64(vreinterpretq_s64_s32(v4s)), vget_low_s64(vreinterpretq_s64_s32(v.v4s)))));
1633
}
1634
1635
ALWAYS_INLINE GSVector4i uph64(const GSVector4i& v) const
1636
{
1637
return GSVector4i(vreinterpretq_s32_s64(
1638
vcombine_s64(vget_high_s64(vreinterpretq_s64_s32(v4s)), vget_high_s64(vreinterpretq_s64_s32(v.v4s)))));
1639
}
1640
1641
ALWAYS_INLINE GSVector4i upl8() const { return upl8(GSVector4i(vdupq_n_s32(0))); }
1642
1643
ALWAYS_INLINE GSVector4i uph8() const { return uph8(GSVector4i(vdupq_n_s32(0))); }
1644
1645
ALWAYS_INLINE GSVector4i upl16() const { return upl16(GSVector4i(vdupq_n_s32(0))); }
1646
1647
ALWAYS_INLINE GSVector4i uph16() const { return uph16(GSVector4i(vdupq_n_s32(0))); }
1648
1649
ALWAYS_INLINE GSVector4i upl32() const { return upl32(GSVector4i(vdupq_n_s32(0))); }
1650
1651
ALWAYS_INLINE GSVector4i uph32() const { return uph32(GSVector4i(vdupq_n_s32(0))); }
1652
1653
ALWAYS_INLINE GSVector4i upl64() const
1654
{
1655
return GSVector4i(vreinterpretq_s32_s64(vcombine_s64(vget_low_s64(vreinterpretq_s64_s32(v4s)), vdup_n_s64(0))));
1656
}
1657
1658
ALWAYS_INLINE GSVector4i uph64() const
1659
{
1660
return GSVector4i(vreinterpretq_s32_s64(vcombine_s64(vget_high_s64(vreinterpretq_s64_s32(v4s)), vdup_n_s64(0))));
1661
}
1662
#endif
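// Note: the upl*/uph* family interleaves lanes like PUNPCKL*/PUNPCKH*: upl takes the
// low halves of the two registers, uph the high halves, e.g. upl8(v) yields
// {a0, b0, a1, b1, ...}. The zero-argument forms interleave with zero. The A64 path
// maps straight onto ZIP1/ZIP2; the A32 fallback builds the same result from
// vzip + vcombine because the 128-bit zip1q/zip2q forms are A64-only.
// (Descriptive comment only.)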
1663
1664
ALWAYS_INLINE GSVector4i s8to16() const
1665
{
1666
return GSVector4i(vreinterpretq_s32_s16(vmovl_s8(vget_low_s8(vreinterpretq_s8_s32(v4s)))));
1667
}
1668
1669
ALWAYS_INLINE GSVector4i u8to16() const
1670
{
1671
return GSVector4i(vreinterpretq_s32_u16(vmovl_u8(vget_low_u8(vreinterpretq_u8_s32(v4s)))));
1672
}
1673
1674
ALWAYS_INLINE GSVector4i s8to32() const
1675
{
1676
return GSVector4i(vmovl_s16(vget_low_s16(vmovl_s8(vget_low_s8(vreinterpretq_s8_s32(v4s))))));
1677
}
1678
1679
ALWAYS_INLINE GSVector4i u8to32() const
1680
{
1681
return GSVector4i(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(vreinterpretq_u8_s32(v4s)))))));
1682
}
1683
1684
ALWAYS_INLINE GSVector4i s8to64() const
1685
{
1686
return GSVector4i(vreinterpretq_s32_s64(
1687
vmovl_s32(vget_low_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_low_s8(vreinterpretq_s8_s32(v4s)))))))));
1688
}
1689
1690
ALWAYS_INLINE GSVector4i u8to64() const
1691
{
1692
return GSVector4i(vreinterpretq_s32_u64(
1693
vmovl_u32(vget_low_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(vreinterpretq_u8_s32(v4s)))))))));
1694
}
1695
1696
ALWAYS_INLINE GSVector4i s16to32() const { return GSVector4i(vmovl_s16(vget_low_s16(vreinterpretq_s16_s32(v4s)))); }
1697
1698
ALWAYS_INLINE GSVector4i u16to32() const
1699
{
1700
return GSVector4i(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vreinterpretq_u16_s32(v4s)))));
1701
}
1702
1703
ALWAYS_INLINE GSVector4i s16to64() const
1704
{
1705
return GSVector4i(
1706
vreinterpretq_s32_s64(vmovl_s32(vget_low_s32(vmovl_s16(vget_low_s16(vreinterpretq_s16_s32(v4s)))))));
1707
}
1708
1709
ALWAYS_INLINE GSVector4i u16to64() const
1710
{
1711
return GSVector4i(
1712
vreinterpretq_s32_u64(vmovl_u32(vget_low_u32(vmovl_u16(vget_low_u16(vreinterpretq_u16_s32(v4s)))))));
1713
}
1714
1715
ALWAYS_INLINE GSVector4i s32to64() const { return GSVector4i(vreinterpretq_s32_s64(vmovl_s32(vget_low_s32(v4s)))); }
1716
1717
ALWAYS_INLINE GSVector4i u32to64() const
1718
{
1719
return GSVector4i(vreinterpretq_s32_u64(vmovl_u32(vget_low_u32(vreinterpretq_u32_s32(v4s)))));
1720
}
1721
1722
template<int i>
1723
ALWAYS_INLINE GSVector4i srl() const
1724
{
1725
return GSVector4i(vreinterpretq_s32_s8(vextq_s8(vreinterpretq_s8_s32(v4s), vdupq_n_s8(0), i)));
1726
}
1727
1728
template<int i>
1729
ALWAYS_INLINE GSVector4i srl(const GSVector4i& v) const
1730
{
1731
if constexpr (i >= 16)
1732
return GSVector4i(vreinterpretq_s32_u8(vextq_u8(vreinterpretq_u8_s32(v.v4s), vdupq_n_u8(0), i - 16)));
1733
else
1734
return GSVector4i(vreinterpretq_s32_u8(vextq_u8(vreinterpretq_u8_s32(v4s), vreinterpretq_u8_s32(v.v4s), i)));
1735
}
1736
1737
template<int i>
1738
ALWAYS_INLINE GSVector4i sll() const
1739
{
1740
return GSVector4i(vreinterpretq_s32_s8(vextq_s8(vdupq_n_s8(0), vreinterpretq_s8_s32(v4s), 16 - i)));
1741
}
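// Note: the byte-granularity srl<i>()/sll<i>() above emulate PSRLDQ/PSLLDQ with VEXT:
// extracting from the concatenation of the value and a zero register shifts the whole
// 128-bit quantity by i bytes, e.g. sll<4>() moves lane 0 into lane 1 and zero-fills
// lane 0. The compile-time count has to keep the vext index inside 0..15.
// (Descriptive comment only.)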
1742
1743
template<int i>
1744
ALWAYS_INLINE GSVector4i sll16() const
1745
{
1746
return GSVector4i(vreinterpretq_s32_s16(vshlq_n_s16(vreinterpretq_s16_s32(v4s), i)));
1747
}
1748
1749
ALWAYS_INLINE GSVector4i sll16(s32 i) const
1750
{
1751
return GSVector4i(vreinterpretq_s32_s16(vshlq_s16(vreinterpretq_s16_s32(v4s), vdupq_n_s16(i))));
1752
}
1753
1754
ALWAYS_INLINE GSVector4i sllv16(const GSVector4i& v) const
1755
{
1756
return GSVector4i(vreinterpretq_s32_s16(vshlq_s16(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v.v4s))));
1757
}
1758
1759
template<int i>
1760
ALWAYS_INLINE GSVector4i srl16() const
1761
{
1762
return GSVector4i(vreinterpretq_s32_u16(vshrq_n_u16(vreinterpretq_u16_s32(v4s), i)));
1763
}
1764
1765
ALWAYS_INLINE GSVector4i srl16(s32 i) const
1766
{
1767
return GSVector4i(vreinterpretq_s32_u16(vshlq_u16(vreinterpretq_u16_s32(v4s), vdupq_n_s16(-i))));
1768
}
1769
1770
ALWAYS_INLINE GSVector4i srlv16(const GSVector4i& v) const
1771
{
1772
return GSVector4i(
1773
vreinterpretq_s32_u16(vshlq_u16(vreinterpretq_u16_s32(v4s), vnegq_s16(vreinterpretq_s16_s32(v.v4s)))));
1774
}
1775
1776
template<int i>
1777
ALWAYS_INLINE GSVector4i sra16() const
1778
{
1779
constexpr int count = (i & ~15) ? 15 : i;
1780
return GSVector4i(vreinterpretq_s32_s16(vshrq_n_s16(vreinterpretq_s16_s32(v4s), count)));
1781
}
1782
1783
ALWAYS_INLINE GSVector4i sra16(s32 i) const
1784
{
1785
return GSVector4i(vreinterpretq_s32_s16(vshlq_s16(vreinterpretq_s16_s32(v4s), vdupq_n_s16(-i))));
1786
}
1787
1788
ALWAYS_INLINE GSVector4i srav16(const GSVector4i& v) const
1789
{
1790
return GSVector4i(
1791
vreinterpretq_s32_s16(vshlq_s16(vreinterpretq_s16_s32(v4s), vnegq_s16(vreinterpretq_s16_s32(v.v4s)))));
1792
}
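// Note: NEON has no variable right-shift; VSHL shifts left for positive lane counts
// and right for negative ones, so the *v16 forms negate the per-lane count and reuse
// VSHL: srlv16 through the unsigned form (logical shift), srav16 through the signed
// form (arithmetic shift). (Descriptive comment only.)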
1793
1794
template<int i>
1795
ALWAYS_INLINE GSVector4i sll32() const
1796
{
1797
return GSVector4i(vshlq_n_s32(v4s, i));
1798
}
1799
1800
ALWAYS_INLINE GSVector4i sll32(s32 i) const { return GSVector4i(vshlq_s32(v4s, vdupq_n_s32(i))); }
1801
1802
ALWAYS_INLINE GSVector4i sllv32(const GSVector4i& v) const { return GSVector4i(vshlq_s32(v4s, v.v4s)); }
1803
1804
template<int i>
1805
ALWAYS_INLINE GSVector4i srl32() const
1806
{
1807
return GSVector4i(vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(v4s), i)));
1808
}
1809
1810
ALWAYS_INLINE GSVector4i srl32(s32 i) const
1811
{
1812
return GSVector4i(vreinterpretq_s32_u32(vshlq_u32(vreinterpretq_u32_s32(v4s), vdupq_n_s32(-i))));
1813
}
1814
1815
ALWAYS_INLINE GSVector4i srlv32(const GSVector4i& v) const
1816
{
1817
return GSVector4i(vreinterpretq_s32_u32(vshlq_u32(vreinterpretq_u32_s32(v4s), vnegq_s32(v.v4s))));
1818
}
1819
1820
template<int i>
1821
ALWAYS_INLINE GSVector4i sra32() const
1822
{
1823
return GSVector4i(vshrq_n_s32(v4s, i));
1824
}
1825
1826
ALWAYS_INLINE GSVector4i sra32(s32 i) const { return GSVector4i(vshlq_s32(v4s, vdupq_n_s32(-i))); }
1827
1828
ALWAYS_INLINE GSVector4i srav32(const GSVector4i& v) const
1829
{
1830
return GSVector4i(vshlq_s32(v4s, vnegq_s32(v.v4s)));
1831
}
1832
1833
template<int i>
1834
ALWAYS_INLINE GSVector4i sll64() const
1835
{
1836
return GSVector4i(vreinterpretq_s32_s64(vshlq_n_s64(vreinterpretq_s64_s32(v4s), i)));
1837
}
1838
1839
ALWAYS_INLINE GSVector4i sll64(s32 i) const
1840
{
1841
return GSVector4i(vreinterpretq_s32_s64(vshlq_s64(vreinterpretq_s64_s32(v4s), vdupq_n_s64(i))));
1842
}
1843
1844
ALWAYS_INLINE GSVector4i sllv64(const GSVector4i& v) const
1845
{
1846
return GSVector4i(vreinterpretq_s32_s64(vshlq_s64(vreinterpretq_s64_s32(v4s), vreinterpretq_s64_s32(v.v4s))));
1847
}
1848
1849
template<int i>
1850
ALWAYS_INLINE GSVector4i srl64() const
1851
{
1852
return GSVector4i(vreinterpretq_s32_u64(vshrq_n_u64(vreinterpretq_u64_s32(v4s), i)));
1853
}
1854
1855
ALWAYS_INLINE GSVector4i srl64(s32 i) const
1856
{
1857
return GSVector4i(vreinterpretq_s32_u64(vshlq_u64(vreinterpretq_u64_s32(v4s), vdupq_n_s64(-i))));
1858
}
1859
1860
#ifdef CPU_ARCH_ARM64
1861
ALWAYS_INLINE GSVector4i srlv64(const GSVector4i& v) const
1862
{
1863
return GSVector4i(
1864
vreinterpretq_s32_u64(vshlq_u64(vreinterpretq_u64_s32(v4s), vnegq_s64(vreinterpretq_s64_s32(v.v4s)))));
1865
}
1866
#endif
1867
1868
ALWAYS_INLINE GSVector4i add8(const GSVector4i& v) const
1869
{
1870
return GSVector4i(vreinterpretq_s32_s8(vaddq_s8(vreinterpretq_s8_s32(v4s), vreinterpretq_s8_s32(v.v4s))));
1871
}
1872
1873
ALWAYS_INLINE GSVector4i add16(const GSVector4i& v) const
1874
{
1875
return GSVector4i(vreinterpretq_s32_s16(vaddq_s16(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v.v4s))));
1876
}
1877
1878
ALWAYS_INLINE GSVector4i add32(const GSVector4i& v) const { return GSVector4i(vaddq_s32(v4s, v.v4s)); }
1879
1880
ALWAYS_INLINE GSVector4i adds8(const GSVector4i& v) const
1881
{
1882
return GSVector4i(vreinterpretq_s32_s8(vqaddq_s8(vreinterpretq_s8_s32(v4s), vreinterpretq_s8_s32(v.v4s))));
1883
}
1884
1885
ALWAYS_INLINE GSVector4i adds16(const GSVector4i& v) const
1886
{
1887
return GSVector4i(vreinterpretq_s32_s16(vqaddq_s16(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v.v4s))));
1888
}
1889
1890
ALWAYS_INLINE GSVector4i hadds16(const GSVector4i& v) const
1891
{
1892
// can't use vpaddq_s16() here, because we need saturation.
1893
// return GSVector4i(vreinterpretq_s32_s16(vpaddq_s16(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v.v4s))));
1894
const int16x8_t a = vreinterpretq_s16_s32(v4s);
1895
const int16x8_t b = vreinterpretq_s16_s32(v.v4s);
1896
#ifdef CPU_ARCH_ARM64
1897
return GSVector4i(vreinterpretq_s32_s16(vqaddq_s16(vuzp1q_s16(a, b), vuzp2q_s16(a, b))));
1898
#else
1899
// sse2neon again
1900
int16x8_t ab0246 = vcombine_s16(vmovn_s32(vreinterpretq_s32_s16(a)), vmovn_s32(vreinterpretq_s32_s16(b)));
int16x8_t ab1357 = vcombine_s16(vshrn_n_s32(vreinterpretq_s32_s16(a), 16), vshrn_n_s32(vreinterpretq_s32_s16(b), 16));
1902
return GSVector4i(vreinterpretq_s32_s16(vqaddq_s16(ab0246, ab1357)));
1903
#endif
1904
}
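// Note: hadds16 follows PHADDSW: de-interleave the even/odd s16 lanes of {*this, v}
// and add them with saturation, so the result is {sat(a0+a1), sat(a2+a3), ...,
// sat(b0+b1), ...}. A plain vpaddq_s16 would wrap on overflow, hence the uzp + vqadd
// sequence. (Descriptive comment only.)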
1905
1906
ALWAYS_INLINE GSVector4i addus8(const GSVector4i& v) const
1907
{
1908
return GSVector4i(vreinterpretq_s32_u8(vqaddq_u8(vreinterpretq_u8_s32(v4s), vreinterpretq_u8_s32(v.v4s))));
1909
}
1910
1911
ALWAYS_INLINE GSVector4i addus16(const GSVector4i& v) const
1912
{
1913
return GSVector4i(vreinterpretq_s32_u16(vqaddq_u16(vreinterpretq_u16_s32(v4s), vreinterpretq_u16_s32(v.v4s))));
1914
}
1915
1916
ALWAYS_INLINE GSVector4i sub8(const GSVector4i& v) const
1917
{
1918
return GSVector4i(vreinterpretq_s32_s8(vsubq_s8(vreinterpretq_s8_s32(v4s), vreinterpretq_s8_s32(v.v4s))));
1919
}
1920
1921
ALWAYS_INLINE GSVector4i sub16(const GSVector4i& v) const
1922
{
1923
return GSVector4i(vreinterpretq_s32_s16(vsubq_s16(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v.v4s))));
1924
}
1925
1926
ALWAYS_INLINE GSVector4i sub32(const GSVector4i& v) const { return GSVector4i(vsubq_s32(v4s, v.v4s)); }
1927
1928
ALWAYS_INLINE GSVector4i subs8(const GSVector4i& v) const
1929
{
1930
return GSVector4i(vreinterpretq_s32_s8(vqsubq_s8(vreinterpretq_s8_s32(v4s), vreinterpretq_s8_s32(v.v4s))));
1931
}
1932
1933
ALWAYS_INLINE GSVector4i subs16(const GSVector4i& v) const
1934
{
1935
return GSVector4i(vreinterpretq_s32_s16(vqsubq_s16(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v.v4s))));
1936
}
1937
1938
ALWAYS_INLINE GSVector4i subus8(const GSVector4i& v) const
1939
{
1940
return GSVector4i(vreinterpretq_s32_u8(vqsubq_u8(vreinterpretq_u8_s32(v4s), vreinterpretq_u8_s32(v.v4s))));
1941
}
1942
1943
ALWAYS_INLINE GSVector4i subus16(const GSVector4i& v) const
1944
{
1945
return GSVector4i(vreinterpretq_s32_u16(vqsubq_u16(vreinterpretq_u16_s32(v4s), vreinterpretq_u16_s32(v.v4s))));
1946
}
1947
1948
ALWAYS_INLINE GSVector4i avg8(const GSVector4i& v) const
1949
{
1950
return GSVector4i(vreinterpretq_s32_u8(vrhaddq_u8(vreinterpretq_u8_s32(v4s), vreinterpretq_u8_s32(v.v4s))));
1951
}
1952
1953
ALWAYS_INLINE GSVector4i avg16(const GSVector4i& v) const
1954
{
1955
return GSVector4i(vreinterpretq_s32_u16(vrhaddq_u16(vreinterpretq_u16_s32(v4s), vreinterpretq_u16_s32(v.v4s))));
1956
}
1957
1958
ALWAYS_INLINE GSVector4i mul16hs(const GSVector4i& v) const
1959
{
1960
// from sse2neon
1961
int16x4_t a3210 = vget_low_s16(vreinterpretq_s16_s32(v4s));
1962
int16x4_t b3210 = vget_low_s16(vreinterpretq_s16_s32(v.v4s));
1963
int32x4_t ab3210 = vmull_s16(a3210, b3210); /* 3333222211110000 */
1964
int16x4_t a7654 = vget_high_s16(vreinterpretq_s16_s32(v4s));
1965
int16x4_t b7654 = vget_high_s16(vreinterpretq_s16_s32(v.v4s));
1966
int32x4_t ab7654 = vmull_s16(a7654, b7654); /* 7777666655554444 */
1967
uint16x8x2_t r = vuzpq_u16(vreinterpretq_u16_s32(ab3210), vreinterpretq_u16_s32(ab7654));
1968
return GSVector4i(vreinterpretq_s32_u16(r.val[1]));
1969
}
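// Note: mul16hs is the PMULHW pattern: widen each s16 pair to a 32-bit product with
// vmull, then keep only the high 16 bits of every product via vuzp, i.e. result lane
// i is (a[i] * b[i]) >> 16. (Descriptive comment only.)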
1970
1971
ALWAYS_INLINE GSVector4i mul16l(const GSVector4i& v) const
1972
{
1973
return GSVector4i(vreinterpretq_s32_s16(vmulq_s16(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v.v4s))));
1974
}
1975
1976
ALWAYS_INLINE GSVector4i mul16hrs(const GSVector4i& v) const
1977
{
1978
int32x4_t mul_lo = vmull_s16(vget_low_s16(vreinterpretq_s16_s32(v4s)), vget_low_s16(vreinterpretq_s16_s32(v.v4s)));
1979
int32x4_t mul_hi =
1980
vmull_s16(vget_high_s16(vreinterpretq_s16_s32(v4s)), vget_high_s16(vreinterpretq_s16_s32(v.v4s)));
1981
int16x4_t narrow_lo = vrshrn_n_s32(mul_lo, 15);
1982
int16x4_t narrow_hi = vrshrn_n_s32(mul_hi, 15);
1983
return GSVector4i(vreinterpretq_s32_s16(vcombine_s16(narrow_lo, narrow_hi)));
1984
}
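// Note: mul16hrs matches PMULHRSW: after the widening multiply, vrshrn_n_s32 with a
// shift of 15 performs the rounding narrow, i.e. lane i becomes
// (a[i] * b[i] + (1 << 14)) >> 15 kept as s16. (Descriptive comment only.)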
1985
1986
ALWAYS_INLINE GSVector4i mul32l(const GSVector4i& v) const { return GSVector4i(vmulq_s32(v4s, v.v4s)); }
1987
1988
ALWAYS_INLINE bool eq(const GSVector4i& v) const
1989
{
1990
const int32x4_t res = veorq_s32(v4s, v.v4s);
1991
#ifdef CPU_ARCH_ARM64
1992
return (vmaxvq_u32(vreinterpretq_u32_s32(res)) == 0);
1993
#else
1994
const int32x2_t paired = vorr_s32(vget_low_s32(res), vget_high_s32(res));
1995
return (vget_lane_u64(vreinterpret_u64_s32(paired), 0) == 0);
1996
#endif
1997
}
1998
1999
ALWAYS_INLINE GSVector4i eq8(const GSVector4i& v) const
2000
{
2001
return GSVector4i(vreinterpretq_s32_u8(vceqq_s8(vreinterpretq_s8_s32(v4s), vreinterpretq_s8_s32(v.v4s))));
2002
}
2003
2004
ALWAYS_INLINE GSVector4i eq16(const GSVector4i& v) const
2005
{
2006
return GSVector4i(vreinterpretq_s32_u16(vceqq_s16(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v.v4s))));
2007
}
2008
2009
ALWAYS_INLINE GSVector4i eq32(const GSVector4i& v) const
2010
{
2011
return GSVector4i(vreinterpretq_s32_u32(vceqq_s32(v4s, v.v4s)));
2012
}
2013
2014
#ifdef CPU_ARCH_ARM64
2015
ALWAYS_INLINE GSVector4i eq64(const GSVector4i& v) const
2016
{
2017
return GSVector4i(vreinterpretq_s32_u64(vceqq_s64(vreinterpretq_s64_s32(v4s), vreinterpretq_s64_s32(v.v4s))));
2018
}
2019
#endif
2020
2021
ALWAYS_INLINE GSVector4i neq8(const GSVector4i& v) const { return ~eq8(v); }
2022
2023
ALWAYS_INLINE GSVector4i neq16(const GSVector4i& v) const { return ~eq16(v); }
2024
2025
ALWAYS_INLINE GSVector4i neq32(const GSVector4i& v) const { return ~eq32(v); }
2026
2027
ALWAYS_INLINE GSVector4i gt8(const GSVector4i& v) const
2028
{
2029
return GSVector4i(vreinterpretq_s32_s8(vcgtq_s8(vreinterpretq_s8_s32(v4s), vreinterpretq_s8_s32(v.v4s))));
2030
}
2031
2032
ALWAYS_INLINE GSVector4i gt16(const GSVector4i& v) const
2033
{
2034
return GSVector4i(vreinterpretq_s32_s16(vcgtq_s16(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v.v4s))));
2035
}
2036
2037
ALWAYS_INLINE GSVector4i gt32(const GSVector4i& v) const { return GSVector4i(vcgtq_s32(v4s, v.v4s)); }
2038
2039
ALWAYS_INLINE GSVector4i ge8(const GSVector4i& v) const
2040
{
2041
return GSVector4i(vreinterpretq_s32_s8(vcgeq_s8(vreinterpretq_s8_s32(v4s), vreinterpretq_s8_s32(v.v4s))));
2042
}
2043
ALWAYS_INLINE GSVector4i ge16(const GSVector4i& v) const
2044
{
2045
return GSVector4i(vreinterpretq_s32_s16(vcgeq_s16(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v.v4s))));
2046
}
2047
ALWAYS_INLINE GSVector4i ge32(const GSVector4i& v) const { return GSVector4i(vcgeq_s32(v4s, v.v4s)); }
2048
2049
ALWAYS_INLINE GSVector4i lt8(const GSVector4i& v) const
2050
{
2051
return GSVector4i(vreinterpretq_s32_s8(vcltq_s8(vreinterpretq_s8_s32(v4s), vreinterpretq_s8_s32(v.v4s))));
2052
}
2053
2054
ALWAYS_INLINE GSVector4i lt16(const GSVector4i& v) const
2055
{
2056
return GSVector4i(vreinterpretq_s32_s16(vcltq_s16(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v.v4s))));
2057
}
2058
2059
ALWAYS_INLINE GSVector4i lt32(const GSVector4i& v) const { return GSVector4i(vcltq_s32(v4s, v.v4s)); }
2060
2061
ALWAYS_INLINE GSVector4i le8(const GSVector4i& v) const
2062
{
2063
return GSVector4i(vreinterpretq_s32_s8(vcleq_s8(vreinterpretq_s8_s32(v4s), vreinterpretq_s8_s32(v.v4s))));
2064
}
2065
ALWAYS_INLINE GSVector4i le16(const GSVector4i& v) const
2066
{
2067
return GSVector4i(vreinterpretq_s32_s16(vcleq_s16(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v.v4s))));
2068
}
2069
ALWAYS_INLINE GSVector4i le32(const GSVector4i& v) const { return GSVector4i(vcleq_s32(v4s, v.v4s)); }
2070
2071
ALWAYS_INLINE GSVector4i andnot(const GSVector4i& v) const { return GSVector4i(vbicq_s32(v4s, v.v4s)); }
2072
2073
ALWAYS_INLINE int mask() const
2074
{
2075
// borrowed from sse2neon
2076
const uint16x8_t high_bits = vreinterpretq_u16_u8(vshrq_n_u8(vreinterpretq_u8_s32(v4s), 7));
2077
const uint32x4_t paired16 = vreinterpretq_u32_u16(vsraq_n_u16(high_bits, high_bits, 7));
2078
const uint64x2_t paired32 = vreinterpretq_u64_u32(vsraq_n_u32(paired16, paired16, 14));
2079
const uint8x16_t paired64 = vreinterpretq_u8_u64(vsraq_n_u64(paired32, paired32, 28));
2080
return static_cast<int>(vgetq_lane_u8(paired64, 0) | ((int)vgetq_lane_u8(paired64, 8) << 8));
2081
}
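// Note: mask() reproduces PMOVMSKB: after isolating every byte's sign bit, the three
// shift-and-accumulate steps squeeze eight of those bits into each half of the
// register, and the two extracted bytes form the 16-bit result (bit n = sign of
// byte n). (Descriptive comment only.)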
2082
2083
ALWAYS_INLINE bool alltrue() const
2084
{
2085
#ifdef CPU_ARCH_ARM64
2086
return (vminvq_u32(vreinterpretq_u32_s32(v4s)) == UINT32_C(0xFFFFFFFF));
2087
#else
2088
return (vget_lane_u64(vreinterpret_u64_s32(vand_s32(vget_low_s32(v4s), vget_high_s32(v4s))), 0) ==
2089
UINT64_C(0xFFFFFFFFFFFFFFFF));
2090
#endif
2091
}
2092
2093
ALWAYS_INLINE bool allfalse() const
2094
{
2095
#ifdef CPU_ARCH_ARM64
2096
return (vmaxvq_u32(vreinterpretq_u32_s32(v4s)) == UINT32_C(0));
2097
#else
2098
return (vget_lane_u64(vreinterpret_u64_s32(vorr_s32(vget_low_s32(v4s), vget_high_s32(v4s))), 0) == UINT64_C(0));
2099
#endif
2100
}
2101
2102
template<int i>
2103
ALWAYS_INLINE GSVector4i insert8(int a) const
2104
{
2105
return GSVector4i(vreinterpretq_s32_u8(vsetq_lane_u8(a, vreinterpretq_u8_s32(v4s), static_cast<uint8_t>(i))));
2106
}
2107
2108
template<int i>
2109
ALWAYS_INLINE int extract8() const
2110
{
2111
return vgetq_lane_u8(vreinterpretq_u8_s32(v4s), i);
2112
}
2113
2114
template<int i>
2115
ALWAYS_INLINE GSVector4i insert16(int a) const
2116
{
2117
return GSVector4i(vreinterpretq_s32_u16(vsetq_lane_u16(a, vreinterpretq_u16_s32(v4s), static_cast<uint16_t>(i))));
2118
}
2119
2120
template<int i>
2121
ALWAYS_INLINE int extract16() const
2122
{
2123
return vgetq_lane_u16(vreinterpretq_u16_s32(v4s), i);
2124
}
2125
2126
template<int i>
2127
ALWAYS_INLINE GSVector4i insert32(int a) const
2128
{
2129
return GSVector4i(vsetq_lane_s32(a, v4s, i));
2130
}
2131
2132
template<int i>
2133
ALWAYS_INLINE int extract32() const
2134
{
2135
return vgetq_lane_s32(v4s, i);
2136
}
2137
2138
template<int i>
2139
ALWAYS_INLINE GSVector4i insert64(s64 a) const
2140
{
2141
return GSVector4i(vreinterpretq_s32_s64(vsetq_lane_s64(a, vreinterpretq_s64_s32(v4s), i)));
2142
}
2143
2144
template<int i>
2145
ALWAYS_INLINE s64 extract64() const
2146
{
2147
return vgetq_lane_s64(vreinterpretq_s64_s32(v4s), i);
2148
}
2149
2150
#ifdef CPU_ARCH_ARM64
2151
ALWAYS_INLINE GSVector4i tbl2(const GSVector4i& a, const GSVector4i& b, const GSVector4i& idx)
2152
{
2153
return GSVector4i(vreinterpretq_s32_u8(
2154
vqtbx2q_u8(vreinterpretq_u8_s32(v4s), uint8x16x2_t{vreinterpretq_u8_s32(a.v4s), vreinterpretq_u8_s32(b.v4s)},
2155
vreinterpretq_u8_s32(idx.v4s))));
2156
}
2157
#endif
2158
2159
ALWAYS_INLINE static GSVector4i loadnt(const void* p)
2160
{
2161
#if __has_builtin(__builtin_nontemporal_store)
2162
return GSVector4i(__builtin_nontemporal_load((int32x4_t*)p));
2163
#else
2164
return GSVector4i(vreinterpretq_s32_s64(vld1q_s64((int64_t*)p)));
2165
#endif
2166
}
2167
2168
ALWAYS_INLINE static GSVector4i load32(const void* p)
2169
{
2170
// should be ldr s0, [x0]
2171
u32 val;
2172
std::memcpy(&val, p, sizeof(u32));
2173
return GSVector4i(vreinterpretq_s32_u32(vsetq_lane_u32(val, vdupq_n_u32(0), 0)));
2174
}
2175
2176
ALWAYS_INLINE static GSVector4i zext32(s32 v) { return GSVector4i(vsetq_lane_s32(v, vdupq_n_s32(0), 0)); }
2177
2178
template<bool aligned>
2179
ALWAYS_INLINE static GSVector4i loadl(const void* p)
2180
{
2181
#ifdef CPU_ARCH_ARM32
2182
if constexpr (!aligned)
2183
return GSVector4i(vcombine_s32(vreinterpret_s32_s8(vld1_s8((int8_t*)p)), vcreate_s32(0)));
2184
#endif
2185
2186
return GSVector4i(vcombine_s32(vld1_s32((const int32_t*)p), vcreate_s32(0)));
2187
}
2188
2189
ALWAYS_INLINE static GSVector4i loadl(const GSVector2i& v) { return GSVector4i(vcombine_s32(v.v2s, vcreate_s32(0))); }
2190
2191
template<bool aligned>
2192
ALWAYS_INLINE static GSVector4i loadh(const void* p)
2193
{
2194
#ifdef CPU_ARCH_ARM32
2195
if constexpr (!aligned)
2196
return GSVector4i(vreinterpretq_s32_s8(vcombine_s8(vdup_n_s8(0), vld1_s8((int8_t*)p))));
2197
#endif
2198
2199
return GSVector4i(vreinterpretq_s32_s64(vcombine_s64(vdup_n_s64(0), vld1_s64((int64_t*)p))));
2200
}
2201
2202
ALWAYS_INLINE static GSVector4i loadh(const GSVector2i& v) { return GSVector4i(vcombine_s32(vcreate_s32(0), v.v2s)); }
2203
2204
template<bool aligned>
2205
ALWAYS_INLINE static GSVector4i load(const void* p)
2206
{
2207
#ifdef CPU_ARCH_ARM32
2208
if constexpr (!aligned)
2209
return GSVector4i(vreinterpretq_s32_s8(vld1q_s8((int8_t*)p)));
2210
#endif
2211
2212
return GSVector4i(vreinterpretq_s32_s64(vld1q_s64((int64_t*)p)));
2213
}
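// Note: the 'aligned' template parameter only changes codegen on A32, where the
// unaligned paths load/store through byte elements (no alignment expectation) instead
// of 64-bit elements (which imply at least 8-byte alignment). On A64 a plain
// vld1q/vst1q tolerates unaligned addresses, so both instantiations end up identical.
// (Descriptive comment; exact fault behaviour depends on target configuration.)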
2214
2215
ALWAYS_INLINE static void storent(void* p, const GSVector4i& v)
2216
{
2217
#if __has_builtin(__builtin_nontemporal_store)
2218
__builtin_nontemporal_store(v.v4s, static_cast<int32x4_t*>(p));
2219
#else
2220
vst1q_s64((int64_t*)p, vreinterpretq_s64_s32(v.v4s));
2221
#endif
2222
}
2223
2224
ALWAYS_INLINE static void store32(void* p, const GSVector4i& v)
2225
{
2226
u32 val = vgetq_lane_s32(v, 0);
2227
std::memcpy(p, &val, sizeof(u32));
2228
}
2229
2230
template<bool aligned>
2231
ALWAYS_INLINE static void storel(void* p, const GSVector4i& v)
2232
{
2233
#ifdef CPU_ARCH_ARM32
2234
if constexpr (!aligned)
2235
{
2236
vst1_s8((int8_t*)p, vget_low_s8(vreinterpretq_s8_s32(v.v4s)));
2237
return;
2238
}
2239
#endif
2240
2241
vst1_s64((int64_t*)p, vget_low_s64(vreinterpretq_s64_s32(v.v4s)));
2242
}
2243
2244
template<bool aligned>
2245
ALWAYS_INLINE static void storeh(void* p, const GSVector4i& v)
2246
{
2247
#ifdef CPU_ARCH_ARM32
2248
if constexpr (!aligned)
2249
{
2250
vst1_s8((int8_t*)p, vget_high_s8(vreinterpretq_s8_s32(v.v4s)));
2251
return;
2252
}
2253
#endif
2254
2255
vst1_s64((int64_t*)p, vget_high_s64(vreinterpretq_s64_s32(v.v4s)));
2256
}
2257
2258
template<bool aligned>
2259
ALWAYS_INLINE static void store(void* p, const GSVector4i& v)
2260
{
2261
#ifdef CPU_ARCH_ARM32
2262
if constexpr (!aligned)
2263
{
2264
vst1q_s8((int8_t*)p, vreinterpretq_s8_s32(v.v4s));
2265
return;
2266
}
2267
#endif
2268
2269
vst1q_s64((int64_t*)p, vreinterpretq_s64_s32(v.v4s));
2270
}
2271
2272
ALWAYS_INLINE static GSVector4i broadcast128(const GSVector4i& v) { return v; }
2273
2274
template<bool aligned>
2275
ALWAYS_INLINE static GSVector4i broadcast128(const void* v)
2276
{
2277
return load<aligned>(v);
2278
}
2279
2280
ALWAYS_INLINE void operator&=(const GSVector4i& v)
2281
{
2282
v4s = vreinterpretq_s32_s8(vandq_s8(vreinterpretq_s8_s32(v4s), vreinterpretq_s8_s32(v.v4s)));
2283
}
2284
2285
ALWAYS_INLINE void operator|=(const GSVector4i& v)
2286
{
2287
v4s = vreinterpretq_s32_s8(vorrq_s8(vreinterpretq_s8_s32(v4s), vreinterpretq_s8_s32(v.v4s)));
2288
}
2289
2290
ALWAYS_INLINE void operator^=(const GSVector4i& v)
2291
{
2292
v4s = vreinterpretq_s32_s8(veorq_s8(vreinterpretq_s8_s32(v4s), vreinterpretq_s8_s32(v.v4s)));
2293
}
2294
2295
ALWAYS_INLINE friend GSVector4i operator&(const GSVector4i& v1, const GSVector4i& v2)
2296
{
2297
return GSVector4i(vreinterpretq_s32_s8(vandq_s8(vreinterpretq_s8_s32(v1.v4s), vreinterpretq_s8_s32(v2.v4s))));
2298
}
2299
2300
ALWAYS_INLINE friend GSVector4i operator|(const GSVector4i& v1, const GSVector4i& v2)
2301
{
2302
return GSVector4i(vreinterpretq_s32_s8(vorrq_s8(vreinterpretq_s8_s32(v1.v4s), vreinterpretq_s8_s32(v2.v4s))));
2303
}
2304
2305
ALWAYS_INLINE friend GSVector4i operator^(const GSVector4i& v1, const GSVector4i& v2)
2306
{
2307
return GSVector4i(vreinterpretq_s32_s8(veorq_s8(vreinterpretq_s8_s32(v1.v4s), vreinterpretq_s8_s32(v2.v4s))));
2308
}
2309
2310
ALWAYS_INLINE friend GSVector4i operator&(const GSVector4i& v, int i) { return v & GSVector4i(i); }
2311
2312
ALWAYS_INLINE friend GSVector4i operator|(const GSVector4i& v, int i) { return v | GSVector4i(i); }
2313
2314
ALWAYS_INLINE friend GSVector4i operator^(const GSVector4i& v, int i) { return v ^ GSVector4i(i); }
2315
2316
ALWAYS_INLINE friend GSVector4i operator~(const GSVector4i& v) { return GSVector4i(vmvnq_s32(v.v4s)); }
2317
2318
ALWAYS_INLINE static GSVector4i zero() { return GSVector4i(0); }
2319
2320
ALWAYS_INLINE static GSVector4i xffffffff() { return GSVector4i(0xFFFFFFFF); }
2321
2322
ALWAYS_INLINE GSVector4i xyxy(const GSVector4i& v) const { return upl64(v); }
2323
2324
ALWAYS_INLINE static GSVector4i xyxy(const GSVector2i& xy, const GSVector2i& zw)
2325
{
2326
return GSVector4i(vcombine_s32(xy.v2s, zw.v2s));
2327
}
2328
2329
ALWAYS_INLINE static GSVector4i xyxy(const GSVector2i& xyzw) { return GSVector4i(vcombine_s32(xyzw.v2s, xyzw.v2s)); }
2330
2331
static GSVector4i rfit(const GSVector4i& fit_rect, const GSVector2i& image_size);
2332
2333
ALWAYS_INLINE GSVector2i xy() const { return GSVector2i(vget_low_s32(v4s)); }
2334
2335
ALWAYS_INLINE GSVector2i zw() const { return GSVector2i(vget_high_s32(v4s)); }
2336
2337
#define VECTOR4i_SHUFFLE_4(xs, xn, ys, yn, zs, zn, ws, wn) \
2338
ALWAYS_INLINE GSVector4i xs##ys##zs##ws() const \
2339
{ \
2340
return GSVector4i(__builtin_shufflevector(v4s, v4s, xn, yn, zn, wn)); \
2341
} \
2342
ALWAYS_INLINE GSVector4i xs##ys##zs##ws##l() const \
2343
{ \
2344
return GSVector4i(vreinterpretq_s32_s16( \
2345
__builtin_shufflevector(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v4s), xn, yn, zn, wn, 4, 5, 6, 7))); \
2346
} \
2347
ALWAYS_INLINE GSVector4i xs##ys##zs##ws##h() const \
2348
{ \
2349
return GSVector4i(vreinterpretq_s32_s16(__builtin_shufflevector( \
2350
vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v4s), 0, 1, 2, 3, 4 + xn, 4 + yn, 4 + zn, 4 + wn))); \
2351
}
2352
2353
#define VECTOR4i_SHUFFLE_3(xs, xn, ys, yn, zs, zn) \
2354
VECTOR4i_SHUFFLE_4(xs, xn, ys, yn, zs, zn, x, 0); \
2355
VECTOR4i_SHUFFLE_4(xs, xn, ys, yn, zs, zn, y, 1); \
2356
VECTOR4i_SHUFFLE_4(xs, xn, ys, yn, zs, zn, z, 2); \
2357
VECTOR4i_SHUFFLE_4(xs, xn, ys, yn, zs, zn, w, 3);
2358
2359
#define VECTOR4i_SHUFFLE_2(xs, xn, ys, yn) \
2360
VECTOR4i_SHUFFLE_3(xs, xn, ys, yn, x, 0); \
2361
VECTOR4i_SHUFFLE_3(xs, xn, ys, yn, y, 1); \
2362
VECTOR4i_SHUFFLE_3(xs, xn, ys, yn, z, 2); \
2363
VECTOR4i_SHUFFLE_3(xs, xn, ys, yn, w, 3);
2364
2365
#define VECTOR4i_SHUFFLE_1(xs, xn) \
2366
VECTOR4i_SHUFFLE_2(xs, xn, x, 0); \
2367
VECTOR4i_SHUFFLE_2(xs, xn, y, 1); \
2368
VECTOR4i_SHUFFLE_2(xs, xn, z, 2); \
2369
VECTOR4i_SHUFFLE_2(xs, xn, w, 3);
2370
2371
VECTOR4i_SHUFFLE_1(x, 0);
2372
VECTOR4i_SHUFFLE_1(y, 1);
2373
VECTOR4i_SHUFFLE_1(z, 2);
2374
VECTOR4i_SHUFFLE_1(w, 3);
2375
2376
#undef VECTOR4i_SHUFFLE_1
2377
#undef VECTOR4i_SHUFFLE_2
2378
#undef VECTOR4i_SHUFFLE_3
2379
#undef VECTOR4i_SHUFFLE_4
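// The VECTOR4i_SHUFFLE_* macros above expand to every xyzw() swizzle of the four
// 32-bit lanes (256 combinations), plus the 'l'/'h' variants that shuffle only the
// low or high four 16-bit lanes (the PSHUFLW/PSHUFHW analogues), all through
// __builtin_shufflevector so the compiler is free to pick the cheapest REV/EXT/TBL
// sequence. (Descriptive comment only.)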
2380
};
2381
2382
class alignas(16) GSVector4
2383
{
2384
struct cxpr_init_tag
2385
{
2386
};
2387
static constexpr cxpr_init_tag cxpr_init{};
2388
2389
constexpr GSVector4(cxpr_init_tag, float x, float y, float z, float w) : F32{x, y, z, w} {}
2390
2391
constexpr GSVector4(cxpr_init_tag, int x, int y, int z, int w) : I32{x, y, z, w} {}
2392
2393
constexpr GSVector4(cxpr_init_tag, u64 x, u64 y) : U64{x, y} {}
2394
2395
constexpr GSVector4(cxpr_init_tag, double x, double y) : F64{x, y} {}
2396
2397
public:
2398
union
2399
{
2400
struct
2401
{
2402
float x, y, z, w;
2403
};
2404
struct
2405
{
2406
float r, g, b, a;
2407
};
2408
struct
2409
{
2410
float left, top, right, bottom;
2411
};
2412
float F32[4];
2413
double F64[2];
2414
s8 I8[16];
2415
s16 I16[8];
2416
s32 I32[4];
2417
s64 I64[2];
2418
u8 U8[16];
2419
u16 U16[8];
2420
u32 U32[4];
2421
u64 U64[2];
2422
float32x4_t v4s;
2423
};
2424
2425
GSVector4() = default;
2426
2427
constexpr static GSVector4 cxpr(float x, float y, float z, float w) { return GSVector4(cxpr_init, x, y, z, w); }
2428
constexpr static GSVector4 cxpr(float x) { return GSVector4(cxpr_init, x, x, x, x); }
2429
2430
constexpr static GSVector4 cxpr(int x, int y, int z, int w) { return GSVector4(cxpr_init, x, y, z, w); }
2431
constexpr static GSVector4 cxpr(int x) { return GSVector4(cxpr_init, x, x, x, x); }
2432
2433
constexpr static GSVector4 cxpr64(u64 x, u64 y) { return GSVector4(cxpr_init, x, y); }
2434
constexpr static GSVector4 cxpr64(u64 x) { return GSVector4(cxpr_init, x, x); }
2435
2436
constexpr static GSVector4 cxpr64(double x, double y) { return GSVector4(cxpr_init, x, y); }
2437
constexpr static GSVector4 cxpr64(double x) { return GSVector4(cxpr_init, x, x); }
2438
2439
constexpr static GSVector4 cxpr_rgba32(u32 rgba)
2440
{
2441
return GSVector4(cxpr_init, static_cast<float>(rgba & 0xff), static_cast<float>((rgba >> 8) & 0xff),
2442
static_cast<float>((rgba >> 16) & 0xff), static_cast<float>((rgba >> 24) & 0xff));
2443
}
2444
2445
constexpr static GSVector4 cxpr_unorm8(u32 rgba)
2446
{
2447
return GSVector4(cxpr_init, static_cast<float>(rgba & 0xff) / 255.0f,
2448
static_cast<float>((rgba >> 8) & 0xff) / 255.0f, static_cast<float>((rgba >> 16) & 0xff) / 255.0f,
2449
static_cast<float>((rgba >> 24) & 0xff) / 255.0f);
2450
}
2451
2452
ALWAYS_INLINE GSVector4(float x, float y, float z, float w)
2453
{
2454
const float arr[4] = {x, y, z, w};
2455
v4s = vld1q_f32(arr);
2456
}
2457
2458
ALWAYS_INLINE GSVector4(float x, float y) { v4s = vsetq_lane_f32(x, vsetq_lane_f32(y, vdupq_n_f32(0.0f), 1), 0); }
2459
2460
ALWAYS_INLINE GSVector4(int x, int y, int z, int w)
2461
{
2462
const int arr[4] = {x, y, z, w};
2463
v4s = vcvtq_f32_s32(vld1q_s32(arr));
2464
}
2465
2466
ALWAYS_INLINE GSVector4(int x, int y)
2467
{
2468
v4s = vcvtq_f32_s32(vsetq_lane_s32(x, vsetq_lane_s32(y, vdupq_n_s32(0), 1), 0));
2469
}
2470
2471
ALWAYS_INLINE explicit GSVector4(const GSVector2& v) { v4s = vcombine_f32(v.v2s, vcreate_f32(0)); }
2472
2473
ALWAYS_INLINE explicit GSVector4(const GSVector2i& v) { v4s = vcombine_f32(vcvt_f32_s32(v.v2s), vcreate_f32(0)); }
2474
2475
ALWAYS_INLINE constexpr explicit GSVector4(float32x4_t m) : v4s(m) {}
2476
2477
ALWAYS_INLINE explicit GSVector4(float f) { v4s = vdupq_n_f32(f); }
2478
2479
ALWAYS_INLINE explicit GSVector4(int i) { v4s = vcvtq_f32_s32(vdupq_n_s32(i)); }
2480
2481
ALWAYS_INLINE explicit GSVector4(const GSVector4i& v);
2482
2483
ALWAYS_INLINE static GSVector4 cast(const GSVector4i& v);
2484
2485
ALWAYS_INLINE static GSVector4 f64(double x, double y)
2486
{
2487
#ifdef CPU_ARCH_ARM64
2488
return GSVector4(vreinterpretq_f32_f64(vsetq_lane_f64(y, vdupq_n_f64(x), 1)));
2489
#else
2490
GSVector4 ret;
2491
ret.F64[0] = x;
2492
ret.F64[1] = y;
2493
return ret;
2494
#endif
2495
}
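// Note: for the *64 operations further down, GSVector4 doubles as a pair of f64 lanes
// stored in the same 128-bit register; f64(x, y) builds that pair and bit-casts it
// into the float32x4_t member, so no value conversion takes place.
// (Descriptive comment only.)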
2496
2497
ALWAYS_INLINE static GSVector4 f64(double x)
2498
{
2499
#ifdef CPU_ARCH_ARM64
2500
return GSVector4(vreinterpretq_f32_f64(vdupq_n_f64(x)));
2501
#else
2502
GSVector4 ret;
2503
ret.F64[0] = ret.F64[1] = x;
2504
return ret;
2505
#endif
2506
}
2507
2508
ALWAYS_INLINE void operator=(float f) { v4s = vdupq_n_f32(f); }
2509
2510
ALWAYS_INLINE void operator=(float32x4_t m) { v4s = m; }
2511
2512
ALWAYS_INLINE operator float32x4_t() const { return v4s; }
2513
2514
ALWAYS_INLINE u32 rgba32() const { return GSVector4i(*this).rgba32(); }
2515
2516
ALWAYS_INLINE static GSVector4 rgba32(u32 rgba)
2517
{
2518
return GSVector4(GSVector4i::zext32(static_cast<s32>(rgba)).u8to32());
2519
}
2520
2521
ALWAYS_INLINE static GSVector4 unorm8(u32 rgba) { return rgba32(rgba) * GSVector4::cxpr(1.0f / 255.0f); }
2522
2523
ALWAYS_INLINE GSVector4 abs() const { return GSVector4(vabsq_f32(v4s)); }
2524
2525
ALWAYS_INLINE GSVector4 neg() const { return GSVector4(vnegq_f32(v4s)); }
2526
2527
#ifdef CPU_ARCH_ARM64
2528
2529
ALWAYS_INLINE GSVector4 floor() const { return GSVector4(vrndmq_f32(v4s)); }
2530
2531
ALWAYS_INLINE GSVector4 ceil() const { return GSVector4(vrndpq_f32(v4s)); }
2532
2533
#else
2534
2535
ALWAYS_INLINE GSVector4 floor() const
2536
{
2537
return GSVector4(std::floor(vgetq_lane_f32(v4s, 0)), std::floor(vgetq_lane_f32(v4s, 1)),
2538
std::floor(vgetq_lane_f32(v4s, 2)), std::floor(vgetq_lane_f32(v4s, 3)));
2539
}
2540
2541
ALWAYS_INLINE GSVector4 ceil() const
2542
{
2543
return GSVector4(std::ceil(vgetq_lane_f32(v4s, 0)), std::ceil(vgetq_lane_f32(v4s, 1)),
2544
std::ceil(vgetq_lane_f32(v4s, 2)), std::ceil(vgetq_lane_f32(v4s, 3)));
2545
}
2546
2547
#endif
2548
2549
#ifdef CPU_ARCH_ARM64
2550
2551
ALWAYS_INLINE GSVector4 hadd() const { return GSVector4(vpaddq_f32(v4s, v4s)); }
2552
2553
ALWAYS_INLINE GSVector4 hadd(const GSVector4& v) const { return GSVector4(vpaddq_f32(v4s, v.v4s)); }
2554
2555
ALWAYS_INLINE GSVector4 hsub() const { return GSVector4(vsubq_f32(vuzp1q_f32(v4s, v4s), vuzp2q_f32(v4s, v4s))); }
2556
2557
ALWAYS_INLINE GSVector4 hsub(const GSVector4& v) const
2558
{
2559
return GSVector4(vsubq_f32(vuzp1q_f32(v4s, v.v4s), vuzp2q_f32(v4s, v.v4s)));
2560
}
2561
2562
#else
2563
2564
ALWAYS_INLINE GSVector4 hadd() const
2565
{
2566
const float32x2_t res = vpadd_f32(vget_low_f32(v4s), vget_high_f32(v4s));
2567
return GSVector4(vcombine_f32(res, res));
2568
}
2569
2570
ALWAYS_INLINE GSVector4 hadd(const GSVector4& v) const
2571
{
2572
const float32x2_t res1 = vpadd_f32(vget_low_f32(v4s), vget_high_f32(v4s));
2573
const float32x2_t res2 = vpadd_f32(vget_low_f32(v.v4s), vget_high_f32(v.v4s));
2574
return GSVector4(vcombine_f32(res1, res2));
2575
}
2576
2577
ALWAYS_INLINE GSVector4 hsub() const
2578
{
2579
const float32x4x2_t res = vuzpq_f32(v4s, v4s);
2580
return GSVector4(vsubq_f32(res.val[0], res.val[1]));
2581
}
2582
2583
ALWAYS_INLINE GSVector4 hsub(const GSVector4& v) const
2584
{
2585
const float32x4x2_t res = vuzpq_f32(v4s, v.v4s);
2586
return GSVector4(vsubq_f32(res.val[0], res.val[1]));
2587
}
2588
2589
#endif
2590
2591
ALWAYS_INLINE float dot(const GSVector4& v) const
2592
{
2593
#ifdef CPU_ARCH_ARM64
2594
return vaddvq_f32(vmulq_f32(v4s, v.v4s));
2595
#else
2596
const float32x4_t dp = vmulq_f32(v4s, v.v4s);
2597
float32x2_t tmp = vadd_f32(vget_low_f32(dp), vget_high_f32(dp)); // (x+z, y+w)
2598
return vget_lane_f32(vadd_f32(tmp, vdup_lane_f32(tmp, 1)), 0);
2599
#endif
2600
}
2601
2602
ALWAYS_INLINE float addv() const
2603
{
2604
#ifdef CPU_ARCH_ARM64
2605
return vaddvq_f32(v4s);
2606
#else
2607
float32x2_t tmp = vadd_f32(vget_low_f32(v4s), vget_high_f32(v4s)); // (x+z, y+w)
2608
return vget_lane_f32(vadd_f32(tmp, vdup_lane_f32(tmp, 1)), 0);
2609
#endif
2610
}
2611
2612
ALWAYS_INLINE float minv() const
2613
{
2614
#ifdef CPU_ARCH_ARM64
2615
return vminvq_f32(v4s);
2616
#else
2617
float32x2_t tmp = vmin_f32(vget_low_f32(v4s), vget_high_f32(v4s)); // (min(x,z), min(y,w))
2618
return vget_lane_f32(vmin_f32(tmp, vdup_lane_f32(tmp, 1)), 0);
2619
#endif
2620
}
2621
2622
ALWAYS_INLINE float maxv() const
2623
{
2624
#ifdef CPU_ARCH_ARM64
2625
return vmaxvq_f32(v4s);
2626
#else
2627
float32x2_t tmp = vmax_f32(vget_low_f32(v4s), vget_high_f32(v4s)); // (max(x,z), max(y,w))
2628
return vget_lane_f32(vmax_f32(tmp, vdup_lane_f32(tmp, 1)), 0);
2629
#endif
2630
}
2631
2632
ALWAYS_INLINE GSVector4 sat(const GSVector4& a, const GSVector4& b) const { return max(a).min(b); }
2633
2634
ALWAYS_INLINE GSVector4 sat(const GSVector4& a) const
2635
{
2636
#ifdef CPU_ARCH_ARM64
2637
const GSVector4 minv(vreinterpretq_f32_f64(vdupq_laneq_f64(vreinterpretq_f64_f32(a.v4s), 0)));
2638
const GSVector4 maxv(vreinterpretq_f32_f64(vdupq_laneq_f64(vreinterpretq_f64_f32(a.v4s), 1)));
2639
#else
2640
const GSVector4 minv(a.xyxy());
2641
const GSVector4 maxv(a.zwzw());
2642
#endif
2643
return sat(minv, maxv);
2644
}
2645
2646
ALWAYS_INLINE GSVector4 sat(const float scale = 255) const { return sat(zero(), GSVector4(scale)); }
2647
2648
ALWAYS_INLINE GSVector4 clamp(const float scale = 255) const { return min(GSVector4(scale)); }
2649
2650
ALWAYS_INLINE GSVector4 min(const GSVector4& a) const { return GSVector4(vminq_f32(v4s, a.v4s)); }
2651
2652
ALWAYS_INLINE GSVector4 max(const GSVector4& a) const { return GSVector4(vmaxq_f32(v4s, a.v4s)); }
2653
2654
template<int mask>
2655
ALWAYS_INLINE GSVector4 blend32(const GSVector4& a) const
2656
{
2657
return GSVector4(__builtin_shufflevector(v4s, a.v4s, (mask & 1) ? 4 : 0, (mask & 2) ? 5 : 1, (mask & 4) ? 6 : 2,
2658
(mask & 8) ? 7 : 3));
2659
}
2660
2661
ALWAYS_INLINE GSVector4 blend32(const GSVector4& a, const GSVector4& mask) const
2662
{
2663
// duplicate sign bit across and bit select
2664
const uint32x4_t bitmask = vreinterpretq_u32_s32(vshrq_n_s32(vreinterpretq_s32_f32(mask.v4s), 31));
2665
return GSVector4(vbslq_f32(bitmask, a.v4s, v4s));
2666
}
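// Note: the mask form of blend32 follows BLENDVPS semantics: only the sign bit of
// each mask lane matters. vshrq_n_s32(mask, 31) smears that bit across the lane and
// vbslq_f32 then picks a where the mask is set and *this where it is clear.
// (Descriptive comment only.)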
2667
2668
#ifdef CPU_ARCH_ARM64
2669
2670
ALWAYS_INLINE GSVector4 upl(const GSVector4& a) const { return GSVector4(vzip1q_f32(v4s, a.v4s)); }
2671
2672
ALWAYS_INLINE GSVector4 uph(const GSVector4& a) const { return GSVector4(vzip2q_f32(v4s, a.v4s)); }
2673
2674
ALWAYS_INLINE GSVector4 upld(const GSVector4& a) const
2675
{
2676
return GSVector4(vreinterpretq_f32_f64(vzip1q_f64(vreinterpretq_f64_f32(v4s), vreinterpretq_f64_f32(a.v4s))));
2677
}
2678
2679
ALWAYS_INLINE GSVector4 uphd(const GSVector4& a) const
2680
{
2681
return GSVector4(vreinterpretq_f32_f64(vzip2q_f64(vreinterpretq_f64_f32(v4s), vreinterpretq_f64_f32(a.v4s))));
2682
}
2683
2684
#else
2685
2686
ALWAYS_INLINE GSVector4 upl(const GSVector4& a) const
2687
{
2688
const float32x2x2_t res = vzip_f32(vget_low_f32(v4s), vget_low_f32(a.v4s));
2689
return GSVector4(vcombine_f32(res.val[0], res.val[1]));
2690
}
2691
2692
ALWAYS_INLINE GSVector4 uph(const GSVector4& a) const
2693
{
2694
const float32x2x2_t res = vzip_f32(vget_high_f32(v4s), vget_high_f32(a.v4s));
2695
return GSVector4(vcombine_f32(res.val[0], res.val[1]));
2696
}
2697
2698
ALWAYS_INLINE GSVector4 upld(const GSVector4& a) const
2699
{
2700
return GSVector4(vreinterpretq_f32_s64(
2701
vcombine_s64(vget_low_s64(vreinterpretq_s64_f32(v4s)), vget_low_s64(vreinterpretq_s64_f32(a.v4s)))));
2702
}
2703
2704
ALWAYS_INLINE GSVector4 uphd(const GSVector4& a) const
2705
{
2706
return GSVector4(vreinterpretq_f32_s64(
2707
vcombine_s64(vget_high_s64(vreinterpretq_s64_f32(v4s)), vget_high_s64(vreinterpretq_s64_f32(a.v4s)))));
2708
}
2709
2710
#endif
2711
2712
ALWAYS_INLINE GSVector4 l2h(const GSVector4& a) const
2713
{
2714
return GSVector4(vcombine_f32(vget_low_f32(v4s), vget_low_f32(a.v4s)));
2715
}
2716
2717
ALWAYS_INLINE GSVector4 h2l(const GSVector4& a) const
2718
{
2719
return GSVector4(vcombine_f32(vget_high_f32(v4s), vget_high_f32(a.v4s)));
2720
}
2721
2722
ALWAYS_INLINE GSVector4 andnot(const GSVector4& v) const
2723
{
2724
return GSVector4(vreinterpretq_f32_s32(vbicq_s32(vreinterpretq_s32_f32(v4s), vreinterpretq_s32_f32(v.v4s))));
2725
}
2726
2727
ALWAYS_INLINE int mask() const
2728
{
2729
#ifdef CPU_ARCH_ARM64
2730
static constexpr const int32_t shifts[] = {0, 1, 2, 3};
2731
return static_cast<int>(vaddvq_u32(vshlq_u32(vshrq_n_u32(vreinterpretq_u32_f32(v4s), 31), vld1q_s32(shifts))));
2732
#else
2733
// sse2neon again
2734
uint64x2_t high_bits = vreinterpretq_u64_u32(vshrq_n_u32(vreinterpretq_u32_f32(v4s), 31));
2735
uint8x16_t paired = vreinterpretq_u8_u64(vsraq_n_u64(high_bits, high_bits, 31));
2736
return vgetq_lane_u8(paired, 0) | (vgetq_lane_u8(paired, 8) << 2);
2737
#endif
2738
}
2739
2740
ALWAYS_INLINE bool alltrue() const
2741
{
2742
#ifdef CPU_ARCH_ARM64
2743
return (vminvq_u32(vreinterpretq_u32_f32(v4s)) == UINT32_C(0xFFFFFFFF));
2744
#else
2745
2746
return (vget_lane_u64(vreinterpret_u64_u32(vand_u32(vget_low_u32(vreinterpretq_u32_f32(v4s)),
2747
vget_high_u32(vreinterpretq_u32_f32(v4s)))),
2748
0) == UINT64_C(0xFFFFFFFFFFFFFFFF));
2749
#endif
2750
}
2751
2752
ALWAYS_INLINE bool allfalse() const
2753
{
2754
#ifdef CPU_ARCH_ARM64
2755
return (vmaxvq_u32(vreinterpretq_u32_f32(v4s)) == UINT32_C(0));
2756
#else
2757
return (vget_lane_u64(vreinterpret_u64_u32(vorr_u32(vget_low_u32(vreinterpretq_u32_f32(v4s)),
2758
vget_high_u32(vreinterpretq_u32_f32(v4s)))),
2759
0) == UINT64_C(0));
2760
#endif
2761
}
2762
2763
ALWAYS_INLINE GSVector4 replace_nan(const GSVector4& v) const { return v.blend32(*this, *this == *this); }
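// Note: replace_nan relies on NaN != NaN: (*this == *this) is all-ones for ordinary
// lanes and all-zeros for NaN lanes, so the blend keeps this vector's lanes and
// substitutes v's lane wherever *this holds a NaN. (Descriptive comment only.)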
2764
2765
template<int src, int dst>
2766
ALWAYS_INLINE GSVector4 insert32(const GSVector4& v) const
2767
{
2768
#ifdef CPU_ARCH_ARM64
2769
return GSVector4(vcopyq_laneq_f32(v4s, dst, v.v4s, src));
2770
#else
2771
return GSVector4(vsetq_lane_f32(vgetq_lane_f32(v.v4s, src), v4s, dst));
2772
#endif
2773
}
2774
2775
template<int i>
2776
ALWAYS_INLINE GSVector4 insert32(float v) const
2777
{
2778
return GSVector4(vsetq_lane_f32(v, v4s, i));
2779
}
2780
2781
template<int i>
2782
ALWAYS_INLINE float extract32() const
2783
{
2784
return vgetq_lane_f32(v4s, i);
2785
}
2786
2787
template<int dst>
2788
ALWAYS_INLINE GSVector4 insert64(double v) const
2789
{
2790
#ifdef CPU_ARCH_ARM64
2791
return GSVector4(vreinterpretq_f32_f64(vsetq_lane_f64(v, vreinterpretq_f64_f32(v4s), dst)));
2792
#else
2793
GSVector4 ret(*this);
2794
ret.F64[dst] = v;
2795
return ret;
2796
#endif
2797
}
2798
2799
template<int src>
2800
ALWAYS_INLINE double extract64() const
2801
{
2802
#ifdef CPU_ARCH_ARM64
2803
return vgetq_lane_f64(vreinterpretq_f64_f32(v4s), src);
2804
#else
2805
return F64[src];
2806
#endif
2807
}
2808
2809
ALWAYS_INLINE static GSVector4 zero() { return GSVector4(vdupq_n_f32(0.0f)); }
2810
2811
ALWAYS_INLINE static GSVector4 xffffffff() { return GSVector4(vreinterpretq_f32_u32(vdupq_n_u32(0xFFFFFFFFu))); }
2812
2813
template<bool aligned>
2814
ALWAYS_INLINE static GSVector4 loadl(const void* p)
2815
{
2816
#ifdef CPU_ARCH_ARM32
2817
if constexpr (!aligned)
2818
return GSVector4(vcombine_f32(vreinterpret_f32_s8(vld1_s8((int8_t*)p)), vcreate_f32(0)));
2819
#endif
2820
2821
return GSVector4(vcombine_f32(vld1_f32((const float*)p), vcreate_f32(0)));
2822
}
2823
2824
ALWAYS_INLINE static GSVector4 load(float f) { return GSVector4(vsetq_lane_f32(f, vmovq_n_f32(0.0f), 0)); }
2825
2826
template<bool aligned>
2827
ALWAYS_INLINE static GSVector4 load(const void* p)
2828
{
2829
#ifdef CPU_ARCH_ARM32
2830
if constexpr (!aligned)
2831
return GSVector4(vreinterpretq_f32_s8(vld1q_s8((int8_t*)p)));
2832
#endif
2833
2834
return GSVector4(vld1q_f32((const float*)p));
2835
}
2836
2837
ALWAYS_INLINE static void storent(void* p, const GSVector4& v) { vst1q_f32((float*)p, v.v4s); }
2838
2839
template<bool aligned>
2840
ALWAYS_INLINE static void storel(void* p, const GSVector4& v)
2841
{
2842
#ifdef CPU_ARCH_ARM32
2843
if constexpr (!aligned)
2844
{
2845
vst1_s8((int8_t*)p, vreinterpret_s8_f32(vget_low_f32(v.v4s)));
2846
return;
2847
}
2848
#endif
2849
2850
vst1_f32((float*)p, vget_low_f32(v.v4s));
2851
}
2852
2853
template<bool aligned>
2854
ALWAYS_INLINE static void storeh(void* p, const GSVector4& v)
2855
{
2856
#ifdef CPU_ARCH_ARM32
2857
if constexpr (!aligned)
2858
{
2859
vst1_s8((int8_t*)p, vreinterpret_s8_f32(vget_high_f32(v.v4s)));
2860
return;
2861
}
2862
#endif
2863
2864
vst1_f32((float*)p, vget_high_f32(v.v4s));
2865
}
2866
2867
template<bool aligned>
2868
ALWAYS_INLINE static void store(void* p, const GSVector4& v)
2869
{
2870
#ifdef CPU_ARCH_ARM32
2871
if constexpr (!aligned)
2872
{
2873
vst1q_s8((int8_t*)p, vreinterpretq_s8_f32(v.v4s));
2874
return;
2875
}
2876
#endif
2877
2878
vst1q_f32((float*)p, v.v4s);
2879
}
2880
2881
ALWAYS_INLINE static void store(float* p, const GSVector4& v) { vst1q_lane_f32(p, v.v4s, 0); }
2882
2883
ALWAYS_INLINE GSVector4 operator-() const { return neg(); }
2884
2885
ALWAYS_INLINE void operator+=(const GSVector4& v) { v4s = vaddq_f32(v4s, v.v4s); }
2886
ALWAYS_INLINE void operator-=(const GSVector4& v) { v4s = vsubq_f32(v4s, v.v4s); }
2887
ALWAYS_INLINE void operator*=(const GSVector4& v) { v4s = vmulq_f32(v4s, v.v4s); }
2888
ALWAYS_INLINE void operator/=(const GSVector4& v)
2889
{
2890
#ifdef CPU_ARCH_ARM64
2891
v4s = vdivq_f32(v4s, v.v4s);
2892
#else
2893
*this =
2894
GSVector4(vgetq_lane_f32(v4s, 0) / vgetq_lane_f32(v.v4s, 0), vgetq_lane_f32(v4s, 1) / vgetq_lane_f32(v.v4s, 1),
2895
vgetq_lane_f32(v4s, 2) / vgetq_lane_f32(v.v4s, 2), vgetq_lane_f32(v4s, 3) / vgetq_lane_f32(v.v4s, 3));
2896
#endif
2897
}
2898
2899
ALWAYS_INLINE void operator+=(float f) { *this += GSVector4(f); }
2900
ALWAYS_INLINE void operator-=(float f) { *this -= GSVector4(f); }
2901
ALWAYS_INLINE void operator*=(float f) { *this *= GSVector4(f); }
2902
ALWAYS_INLINE void operator/=(float f)
2903
{
2904
#ifdef CPU_ARCH_ARM64
2905
*this /= GSVector4(f);
2906
#else
2907
*this = GSVector4(vgetq_lane_f32(v4s, 0) / f, vgetq_lane_f32(v4s, 1) / f, vgetq_lane_f32(v4s, 2) / f,
2908
vgetq_lane_f32(v4s, 3) / f);
2909
#endif
2910
}
2911
2912
ALWAYS_INLINE void operator&=(const GSVector4& v)
2913
{
2914
v4s = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(v4s), vreinterpretq_u32_f32(v.v4s)));
2915
}
2916
2917
ALWAYS_INLINE void operator|=(const GSVector4& v)
2918
{
2919
v4s = vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(v4s), vreinterpretq_u32_f32(v.v4s)));
2920
}
2921
2922
ALWAYS_INLINE void operator^=(const GSVector4& v)
2923
{
2924
v4s = vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(v4s), vreinterpretq_u32_f32(v.v4s)));
2925
}
2926
2927
ALWAYS_INLINE friend GSVector4 operator+(const GSVector4& v1, const GSVector4& v2)
2928
{
2929
return GSVector4(vaddq_f32(v1.v4s, v2.v4s));
2930
}
2931
2932
ALWAYS_INLINE friend GSVector4 operator-(const GSVector4& v1, const GSVector4& v2)
2933
{
2934
return GSVector4(vsubq_f32(v1.v4s, v2.v4s));
2935
}
2936
2937
ALWAYS_INLINE friend GSVector4 operator*(const GSVector4& v1, const GSVector4& v2)
2938
{
2939
return GSVector4(vmulq_f32(v1.v4s, v2.v4s));
2940
}
2941
2942
ALWAYS_INLINE friend GSVector4 operator/(const GSVector4& v1, const GSVector4& v2)
2943
{
2944
#ifdef CPU_ARCH_ARM64
2945
return GSVector4(vdivq_f32(v1.v4s, v2.v4s));
2946
#else
2947
return GSVector4(
2948
vgetq_lane_f32(v1.v4s, 0) / vgetq_lane_f32(v2.v4s, 0), vgetq_lane_f32(v1.v4s, 1) / vgetq_lane_f32(v2.v4s, 1),
2949
vgetq_lane_f32(v1.v4s, 2) / vgetq_lane_f32(v2.v4s, 2), vgetq_lane_f32(v1.v4s, 3) / vgetq_lane_f32(v2.v4s, 3));
2950
#endif
2951
}
2952
2953
ALWAYS_INLINE friend GSVector4 operator+(const GSVector4& v, float f) { return v + GSVector4(f); }
2954
ALWAYS_INLINE friend GSVector4 operator-(const GSVector4& v, float f) { return v - GSVector4(f); }
2955
ALWAYS_INLINE friend GSVector4 operator*(const GSVector4& v, float f) { return v * GSVector4(f); }
2956
ALWAYS_INLINE friend GSVector4 operator/(const GSVector4& v, float f)
2957
{
2958
#ifdef CPU_ARCH_ARM64
2959
return v / GSVector4(f);
2960
#else
2961
return GSVector4(vgetq_lane_f32(v.v4s, 0) / f, vgetq_lane_f32(v.v4s, 1) / f, vgetq_lane_f32(v.v4s, 2) / f,
2962
vgetq_lane_f32(v.v4s, 3) / f);
2963
#endif
2964
}
2965
2966
ALWAYS_INLINE friend GSVector4 operator&(const GSVector4& v1, const GSVector4& v2)
2967
{
2968
return GSVector4(vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(v1.v4s), vreinterpretq_u32_f32(v2.v4s))));
2969
}
2970
2971
ALWAYS_INLINE friend GSVector4 operator|(const GSVector4& v1, const GSVector4& v2)
2972
{
2973
return GSVector4(vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(v1.v4s), vreinterpretq_u32_f32(v2.v4s))));
2974
}
2975
2976
ALWAYS_INLINE friend GSVector4 operator^(const GSVector4& v1, const GSVector4& v2)
2977
{
2978
return GSVector4(vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(v1.v4s), vreinterpretq_u32_f32(v2.v4s))));
2979
}
2980
2981
ALWAYS_INLINE friend GSVector4 operator==(const GSVector4& v1, const GSVector4& v2)
2982
{
2983
return GSVector4(vreinterpretq_f32_u32(vceqq_f32(v1.v4s, v2.v4s)));
2984
}
2985
2986
ALWAYS_INLINE friend GSVector4 operator!=(const GSVector4& v1, const GSVector4& v2)
2987
{
2988
// NEON has no !=
2989
return GSVector4(vreinterpretq_f32_u32(vmvnq_u32(vceqq_f32(v1.v4s, v2.v4s))));
2990
}
2991
2992
ALWAYS_INLINE friend GSVector4 operator>(const GSVector4& v1, const GSVector4& v2)
2993
{
2994
return GSVector4(vreinterpretq_f32_u32(vcgtq_f32(v1.v4s, v2.v4s)));
2995
}
2996
2997
ALWAYS_INLINE friend GSVector4 operator<(const GSVector4& v1, const GSVector4& v2)
2998
{
2999
return GSVector4(vreinterpretq_f32_u32(vcltq_f32(v1.v4s, v2.v4s)));
3000
}
3001
3002
ALWAYS_INLINE friend GSVector4 operator>=(const GSVector4& v1, const GSVector4& v2)
3003
{
3004
return GSVector4(vreinterpretq_f32_u32(vcgeq_f32(v1.v4s, v2.v4s)));
3005
}
3006
3007
ALWAYS_INLINE friend GSVector4 operator<=(const GSVector4& v1, const GSVector4& v2)
3008
{
3009
return GSVector4(vreinterpretq_f32_u32(vcleq_f32(v1.v4s, v2.v4s)));
3010
}
3011
3012
ALWAYS_INLINE GSVector4 mul64(const GSVector4& v) const
3013
{
3014
#ifdef CPU_ARCH_ARM64
3015
return GSVector4(vreinterpretq_f32_f64(vmulq_f64(vreinterpretq_f64_f32(v4s), vreinterpretq_f64_f32(v.v4s))));
3016
#else
3017
return GSVector4::f64(F64[0] * v.F64[0], F64[1] * v.F64[1]);
3018
#endif
3019
}
3020
3021
ALWAYS_INLINE GSVector4 add64(const GSVector4& v) const
3022
{
3023
#ifdef CPU_ARCH_ARM64
3024
return GSVector4(vreinterpretq_f32_f64(vaddq_f64(vreinterpretq_f64_f32(v4s), vreinterpretq_f64_f32(v.v4s))));
3025
#else
3026
return GSVector4::f64(F64[0] + v.F64[0], F64[1] + v.F64[1]);
3027
#endif
3028
}
3029
3030
ALWAYS_INLINE GSVector4 sub64(const GSVector4& v) const
3031
{
3032
#ifdef CPU_ARCH_ARM64
3033
return GSVector4(vreinterpretq_f32_f64(vsubq_f64(vreinterpretq_f64_f32(v4s), vreinterpretq_f64_f32(v.v4s))));
3034
#else
3035
return GSVector4::f64(F64[0] - v.F64[0], F64[1] - v.F64[1]);
3036
#endif
3037
}
3038
3039
ALWAYS_INLINE GSVector4 div64(const GSVector4& v) const
3040
{
3041
#ifdef CPU_ARCH_ARM64
3042
return GSVector4(vreinterpretq_f32_f64(vdivq_f64(vreinterpretq_f64_f32(v4s), vreinterpretq_f64_f32(v.v4s))));
3043
#else
3044
return GSVector4::f64(F64[0] / v.F64[0], F64[1] / v.F64[1]);
3045
#endif
3046
}
3047
3048
ALWAYS_INLINE GSVector4 gt64(const GSVector4& v) const
3049
{
3050
#ifdef CPU_ARCH_ARM64
3051
return GSVector4(vreinterpretq_f32_f64(vcgtq_f64(vreinterpretq_f64_f32(v4s), vreinterpretq_f64_f32(v.v4s))));
3052
#else
3053
GSVector4 ret;
3054
ret.U64[0] = (F64[0] > v.F64[0]) ? 0xFFFFFFFFFFFFFFFFULL : 0;
3055
ret.U64[1] = (F64[1] > v.F64[1]) ? 0xFFFFFFFFFFFFFFFFULL : 0;
3056
return ret;
3057
#endif
3058
}
3059
3060
ALWAYS_INLINE GSVector4 eq64(const GSVector4& v) const
3061
{
3062
#ifdef CPU_ARCH_ARM64
3063
return GSVector4(vreinterpretq_f32_f64(vceqq_f64(vreinterpretq_f64_f32(v4s), vreinterpretq_f64_f32(v.v4s))));
3064
#else
3065
GSVector4 ret;
3066
ret.U64[0] = (F64[0] == v.F64[0]) ? 0xFFFFFFFFFFFFFFFFULL : 0;
3067
ret.U64[1] = (F64[1] == v.F64[1]) ? 0xFFFFFFFFFFFFFFFFULL : 0;
3068
return ret;
3069
#endif
3070
}
3071
3072
ALWAYS_INLINE GSVector4 lt64(const GSVector4& v) const
3073
{
3074
#ifdef CPU_ARCH_ARM64
3075
return GSVector4(vreinterpretq_f32_f64(vcltq_f64(vreinterpretq_f64_f32(v4s), vreinterpretq_f64_f32(v.v4s))));
3076
#else
3077
GSVector4 ret;
3078
ret.U64[0] = (F64[0] < v.F64[0]) ? 0xFFFFFFFFFFFFFFFFULL : 0;
3079
ret.U64[1] = (F64[1] < v.F64[1]) ? 0xFFFFFFFFFFFFFFFFULL : 0;
3080
return ret;
3081
#endif
3082
}
3083
3084
ALWAYS_INLINE GSVector4 ge64(const GSVector4& v) const
3085
{
3086
#ifdef CPU_ARCH_ARM64
3087
return GSVector4(vreinterpretq_f32_f64(vcgeq_f64(vreinterpretq_f64_f32(v4s), vreinterpretq_f64_f32(v.v4s))));
3088
#else
3089
GSVector4 ret;
3090
ret.U64[0] = (F64[0] >= v.F64[0]) ? 0xFFFFFFFFFFFFFFFFULL : 0;
3091
ret.U64[1] = (F64[1] >= v.F64[1]) ? 0xFFFFFFFFFFFFFFFFULL : 0;
3092
return ret;
3093
#endif
3094
}
3095
3096
ALWAYS_INLINE GSVector4 le64(const GSVector4& v) const
3097
{
3098
#ifdef CPU_ARCH_ARM64
3099
return GSVector4(vreinterpretq_f32_f64(vcleq_f64(vreinterpretq_f64_f32(v4s), vreinterpretq_f64_f32(v.v4s))));
3100
#else
3101
GSVector4 ret;
3102
ret.U64[0] = (F64[0] <= v.F64[0]) ? 0xFFFFFFFFFFFFFFFFULL : 0;
3103
ret.U64[1] = (F64[1] <= v.F64[1]) ? 0xFFFFFFFFFFFFFFFFULL : 0;
3104
return ret;
3105
#endif
3106
}
3107
3108
ALWAYS_INLINE GSVector4 min64(const GSVector4& v) const
3109
{
3110
#ifdef CPU_ARCH_ARM64
3111
return GSVector4(vreinterpretq_f32_f64(vminq_f64(vreinterpretq_f64_f32(v4s), vreinterpretq_f64_f32(v.v4s))));
3112
#else
3113
return GSVector4::f64(std::min(F64[0], v.F64[0]), std::min(F64[1], v.F64[1]));
3114
#endif
3115
}
3116
3117
ALWAYS_INLINE GSVector4 max64(const GSVector4& v) const
3118
{
3119
#ifdef CPU_ARCH_ARM64
3120
return GSVector4(vreinterpretq_f32_f64(vmaxq_f64(vreinterpretq_f64_f32(v4s), vreinterpretq_f64_f32(v.v4s))));
3121
#else
3122
return GSVector4::f64(std::max(F64[0], v.F64[0]), std::max(F64[1], v.F64[1]));
3123
#endif
3124
}
3125
3126
ALWAYS_INLINE GSVector4 abs64() const { return *this & GSVector4::cxpr64(static_cast<u64>(0x7FFFFFFFFFFFFFFFULL)); }
3127
3128
ALWAYS_INLINE GSVector4 neg64() const { return *this ^ GSVector4::cxpr64(static_cast<u64>(0x8000000000000000ULL)); }
3129
3130
ALWAYS_INLINE GSVector4 sqrt64() const
3131
{
3132
#ifdef CPU_ARCH_ARM64
3133
return GSVector4(vreinterpretq_f32_f64(vsqrtq_f64(vreinterpretq_f64_f32(v4s))));
3134
#else
3135
return GSVector4::f64(std::sqrt(F64[0]), std::sqrt(F64[1]));
3136
#endif
3137
}
3138
3139
ALWAYS_INLINE GSVector4 sqr64() const
3140
{
3141
#ifdef CPU_ARCH_ARM64
3142
return GSVector4(vreinterpretq_f32_f64(vmulq_f64(vreinterpretq_f64_f32(v4s), vreinterpretq_f64_f32(v4s))));
3143
#else
3144
return GSVector4::f64(F64[0] * F64[0], F64[1] * F64[1]);
3145
#endif
3146
}
3147
3148
ALWAYS_INLINE GSVector4 floor64() const
3149
{
3150
#ifdef CPU_ARCH_ARM64
3151
return GSVector4(vreinterpretq_f32_f64(vrndmq_f64(vreinterpretq_f64_f32(v4s))));
3152
#else
3153
return GSVector4::f64(std::floor(F64[0]), std::floor(F64[1]));
3154
#endif
3155
}
3156
3157
ALWAYS_INLINE static GSVector4 f32to64(const GSVector4& v)
{
#ifdef CPU_ARCH_ARM64
return GSVector4(vreinterpretq_f32_f64(vcvt_f64_f32(vget_low_f32(v.v4s))));
#else
return GSVector4::f64(static_cast<double>(vgetq_lane_f32(v.v4s, 0)), static_cast<double>(vgetq_lane_f32(v.v4s, 1)));
#endif
}

ALWAYS_INLINE static GSVector4 f32to64(const void* p)
{
#ifdef CPU_ARCH_ARM64
return GSVector4(vreinterpretq_f32_f64(vcvt_f64_f32(vld1_f32(static_cast<const float*>(p)))));
#else
const float* fp = static_cast<const float*>(p);
return GSVector4::f64(static_cast<double>(fp[0]), static_cast<double>(fp[1]));
#endif
}

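// f64toi32 truncates each f64 lane toward zero and packs the results into lanes 0/1, zeroing lanes 2/3,
// e.g. {1.9, -2.9} -> {1, -2, 0, 0}.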
ALWAYS_INLINE GSVector4i f64toi32() const
{
#ifdef CPU_ARCH_ARM64
const s32 low = static_cast<s32>(vgetq_lane_f64(vreinterpretq_f64_f32(v4s), 0));
const s32 high = static_cast<s32>(vgetq_lane_f64(vreinterpretq_f64_f32(v4s), 1));
#else
const s32 low = static_cast<s32>(F64[0]);
const s32 high = static_cast<s32>(F64[1]);
#endif
return GSVector4i(vsetq_lane_s32(high, vsetq_lane_s32(low, vdupq_n_s32(0), 0), 1));
}

ALWAYS_INLINE GSVector2 xy() const { return GSVector2(vget_low_f32(v4s)); }

ALWAYS_INLINE GSVector2 zw() const { return GSVector2(vget_high_f32(v4s)); }

ALWAYS_INLINE static GSVector4 xyxy(const GSVector2& l, const GSVector2& h)
{
return GSVector4(vcombine_f32(l.v2s, h.v2s));
}

ALWAYS_INLINE static GSVector4 xyxy(const GSVector2& l) { return GSVector4(vcombine_f32(l.v2s, l.v2s)); }

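// VECTOR4_SHUFFLE_* expands to every xyzw swizzle member, e.g. v.xxxx() or v.wzyx(). The one-argument
// form takes the first two selections from *this and the last two from the argument, so
// a.xyzw(b) == { a.x, a.y, b.z, b.w }.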
#define VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, ws, wn) \
ALWAYS_INLINE GSVector4 xs##ys##zs##ws() const \
{ \
return GSVector4(__builtin_shufflevector(v4s, v4s, xn, yn, zn, wn)); \
} \
ALWAYS_INLINE GSVector4 xs##ys##zs##ws(const GSVector4& v) const \
{ \
return GSVector4(__builtin_shufflevector(v4s, v.v4s, xn, yn, 4 + zn, 4 + wn)); \
}

#define VECTOR4_SHUFFLE_3(xs, xn, ys, yn, zs, zn) \
VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, x, 0); \
VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, y, 1); \
VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, z, 2); \
VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, w, 3);

#define VECTOR4_SHUFFLE_2(xs, xn, ys, yn) \
VECTOR4_SHUFFLE_3(xs, xn, ys, yn, x, 0); \
VECTOR4_SHUFFLE_3(xs, xn, ys, yn, y, 1); \
VECTOR4_SHUFFLE_3(xs, xn, ys, yn, z, 2); \
VECTOR4_SHUFFLE_3(xs, xn, ys, yn, w, 3);

#define VECTOR4_SHUFFLE_1(xs, xn) \
VECTOR4_SHUFFLE_2(xs, xn, x, 0); \
VECTOR4_SHUFFLE_2(xs, xn, y, 1); \
VECTOR4_SHUFFLE_2(xs, xn, z, 2); \
VECTOR4_SHUFFLE_2(xs, xn, w, 3);

VECTOR4_SHUFFLE_1(x, 0);
VECTOR4_SHUFFLE_1(y, 1);
VECTOR4_SHUFFLE_1(z, 2);
VECTOR4_SHUFFLE_1(w, 3);

#undef VECTOR4_SHUFFLE_1
#undef VECTOR4_SHUFFLE_2
#undef VECTOR4_SHUFFLE_3
#undef VECTOR4_SHUFFLE_4

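// broadcast32 splats lane 0 (or a single float loaded from memory) across all four lanes. vdupq_laneq_f32
// is AArch64-only, so the A32 path falls back to the xxxx() swizzle.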
ALWAYS_INLINE GSVector4 broadcast32() const
{
#ifdef CPU_ARCH_ARM64
return GSVector4(vdupq_laneq_f32(v4s, 0));
#else
return xxxx();
#endif
}

ALWAYS_INLINE static GSVector4 broadcast32(const GSVector4& v)
{
#ifdef CPU_ARCH_ARM64
return GSVector4(vdupq_laneq_f32(v.v4s, 0));
#else
return v.xxxx();
#endif
}

ALWAYS_INLINE static GSVector4 broadcast32(const void* f) { return GSVector4(vld1q_dup_f32((const float*)f)); }

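// broadcast64 duplicates a 64-bit value into both halves of the register; A32 has no float64x2_t, so the
// value is loaded as s64 and the bits reinterpreted.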
ALWAYS_INLINE static GSVector4 broadcast64(const void* f)
{
#ifdef CPU_ARCH_ARM64
return GSVector4(vreinterpretq_f32_f64(vld1q_dup_f64((const double*)f)));
#else
return GSVector4(vreinterpretq_f32_s64(vld1q_dup_s64((const s64*)f)));
#endif
}
};

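// Out-of-line conversions between the float and integer vector types: the converting constructors do a
// value conversion (vcvt, truncating toward zero for float->int), while cast() reinterprets the bits
// unchanged.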
ALWAYS_INLINE GSVector2i::GSVector2i(const GSVector2& v)
{
v2s = vcvt_s32_f32(v.v2s);
}

ALWAYS_INLINE GSVector2::GSVector2(const GSVector2i& v)
{
v2s = vcvt_f32_s32(v.v2s);
}

ALWAYS_INLINE GSVector2i GSVector2i::cast(const GSVector2& v)
{
return GSVector2i(vreinterpret_s32_f32(v.v2s));
}

ALWAYS_INLINE GSVector2 GSVector2::cast(const GSVector2i& v)
{
return GSVector2(vreinterpret_f32_s32(v.v2s));
}

ALWAYS_INLINE GSVector4i::GSVector4i(const GSVector4& v)
{
v4s = vcvtq_s32_f32(v.v4s);
}

ALWAYS_INLINE GSVector4::GSVector4(const GSVector4i& v)
{
v4s = vcvtq_f32_s32(v.v4s);
}

ALWAYS_INLINE GSVector4i GSVector4i::cast(const GSVector4& v)
{
return GSVector4i(vreinterpretq_s32_f32(v.v4s));
}

ALWAYS_INLINE GSVector4 GSVector4::cast(const GSVector4i& v)
{
return GSVector4(vreinterpretq_f32_s32(v.v4s));
}