simd.h
#pragma once
//------------------------------------------------------------------------------
#include <stdint.h> // for int32_t in the set_* signatures

#if NEBULA_SIMD_X64
#include <xmmintrin.h>
#include <emmintrin.h>
#include <smmintrin.h>
#include <immintrin.h>
typedef __m128 f32x4;
typedef __m128i i32x4;
typedef __m128i u32x4;

f32x4 cast_i32x4_to_f32x4(i32x4 x);
i32x4 set_i32x4(int32_t x, int32_t y, int32_t z, int32_t w);
static const f32x4 _mask_xyz = cast_i32x4_to_f32x4(set_i32x4( 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0 ));
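// ANDing a vector with _mask_xyz keeps the x, y and z lanes and zeroes w;
// the f32x3 loads below rely on it to clear the unused fourth component.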

//------------------------------------------------------------------------------
__forceinline f32x4
set_f32x4(float x, float y, float z, float w)
{
    return _mm_setr_ps(x, y, z, w);
}

//------------------------------------------------------------------------------
__forceinline i32x4
set_i32x4(int32_t x, int32_t y, int32_t z, int32_t w)
{
    return _mm_setr_epi32(x, y, z, w);
}

//------------------------------------------------------------------------------
__forceinline f32x4
splat_f32x4(float x)
{
    return _mm_set1_ps(x);
}

//------------------------------------------------------------------------------
__forceinline f32x4
cast_i32x4_to_f32x4(i32x4 x)
{
    return _mm_castsi128_ps(x);
}

//------------------------------------------------------------------------------
__forceinline f32x4
set_last_f32x4(f32x4 v, float val)
{
    f32x4 vec = _mm_set_ss(val);
    // control byte 0b00110000 copies source lane 0 into destination lane 3
    return _mm_insert_ps(v, vec, 0b00110000);
}

//------------------------------------------------------------------------------
__forceinline u32x4
compare_equal_f32x4(f32x4 a, f32x4 b)
{
    return _mm_castps_si128(_mm_cmpeq_ps(a, b));
}

//------------------------------------------------------------------------------
__forceinline u32x4
compare_greater_equal_f32x4(f32x4 a, f32x4 b)
{
    return _mm_castps_si128(_mm_cmpge_ps(a, b));
}

//------------------------------------------------------------------------------
__forceinline u32x4
compare_greater_f32x4(f32x4 a, f32x4 b)
{
    return _mm_castps_si128(_mm_cmpgt_ps(a, b));
}

//------------------------------------------------------------------------------
__forceinline u32x4
compare_less_equal_f32x4(f32x4 a, f32x4 b)
{
    return _mm_castps_si128(_mm_cmple_ps(a, b));
}

//------------------------------------------------------------------------------
__forceinline u32x4
compare_less_f32x4(f32x4 a, f32x4 b)
{
    return _mm_castps_si128(_mm_cmplt_ps(a, b));
}

//------------------------------------------------------------------------------
__forceinline bool
all_u32x4(u32x4 a)
{
    // _mm_movemask_ps yields one bit per 32-bit lane; _mm_movemask_epi8 yields
    // one bit per byte, so comparing it against 0xF checked the wrong bits.
    // e.g. all_u32x4(compare_equal_f32x4(a, b)) tests all four lanes for equality
    return _mm_movemask_ps(_mm_castsi128_ps(a)) == 0xF;
}

//------------------------------------------------------------------------------
__forceinline bool
all_u32x3(u32x4 a)
{
    return (_mm_movemask_ps(_mm_castsi128_ps(a)) & 0x7) == 0x7;
}

//------------------------------------------------------------------------------
__forceinline bool
any_u32x4(u32x4 a)
{
    return _mm_movemask_ps(_mm_castsi128_ps(a)) != 0x0;
}

//------------------------------------------------------------------------------
__forceinline bool
any_u32x3(u32x4 a)
{
    return (_mm_movemask_ps(_mm_castsi128_ps(a)) & 0x7) != 0x0;
}

//------------------------------------------------------------------------------
__forceinline f32x4
load_unaligned_f32x3(const float* ptr)
{
    // loads four floats (16 bytes), then masks off the w lane
    f32x4 vec = _mm_loadu_ps(ptr);
    return _mm_and_ps(vec, _mask_xyz);
}

//------------------------------------------------------------------------------
__forceinline f32x4
load_aligned_f32x3(const float* ptr)
{
    f32x4 vec = _mm_load_ps(ptr);
    return _mm_and_ps(vec, _mask_xyz);
}

//------------------------------------------------------------------------------
__forceinline void
store_f32x3(f32x4 vec, float* ptr)
{
    f32x4 t1 = _mm_permute_ps(vec, _MM_SHUFFLE(1, 1, 1, 1));
    f32x4 t2 = _mm_permute_ps(vec, _MM_SHUFFLE(2, 2, 2, 2));
    _mm_store_ss(&ptr[0], vec);
    _mm_store_ss(&ptr[1], t1);
    _mm_store_ss(&ptr[2], t2);
}

//------------------------------------------------------------------------------
__forceinline void
store_f32x4(f32x4 vec, float* ptr)
{
    _mm_store_ps(ptr, vec);
}

//------------------------------------------------------------------------------
__forceinline void
store_f32(f32x4 vec, float* ptr)
{
    _mm_store_ss(ptr, vec);
}

//------------------------------------------------------------------------------
__forceinline f32x4
flip_sign_f32x4(f32x4 vec)
{
    static const __m128i _sign = _mm_setr_epi32(0x80000000, 0x80000000, 0x80000000, 0x80000000);
    return _mm_xor_ps(_mm_castsi128_ps(_sign), vec);
}

//------------------------------------------------------------------------------
__forceinline f32x4
mul_f32x4(f32x4 a, f32x4 b)
{
    return _mm_mul_ps(a, b);
}

//------------------------------------------------------------------------------
__forceinline f32x4
mul_first_f32x4(f32x4 a, f32x4 b)
{
    return _mm_mul_ss(a, b);
}

//------------------------------------------------------------------------------
__forceinline f32x4
fma_f32x4(f32x4 a, f32x4 b, f32x4 c)
{
#if NEBULA_MATH_FMA
    return _mm_fmadd_ps(a, b, c);
#else
    // fallback rounds twice (mul, then add) and is not bit-identical to fused FMA
    return _mm_add_ps(_mm_mul_ps(a, b), c);
#endif
}

//------------------------------------------------------------------------------
__forceinline f32x4
div_f32x4(f32x4 a, f32x4 b)
{
    return _mm_div_ps(a, b);
}

//------------------------------------------------------------------------------
__forceinline f32x4
add_f32x4(f32x4 a, f32x4 b)
{
    return _mm_add_ps(a, b);
}

//------------------------------------------------------------------------------
__forceinline f32x4
sub_f32x4(f32x4 a, f32x4 b)
{
    return _mm_sub_ps(a, b);
}

//------------------------------------------------------------------------------
__forceinline f32x4
dot_f32x4(f32x4 a, f32x4 b)
{
    // mask 0xFF: multiply all four lanes and broadcast the sum to every lane
    return _mm_dp_ps(a, b, 0xFF);
}

//------------------------------------------------------------------------------
__forceinline float
dot_f32x3(f32x4 a, f32x4 b)
{
    // mask 0x71: multiply only x, y and z, write the sum to lane 0
    return _mm_cvtss_f32(_mm_dp_ps(a, b, 0x71));
}

//------------------------------------------------------------------------------
__forceinline f32x4
abs_f32x4(f32x4 a)
{
    // clear the sign bit of every lane without punning through a float pointer
    return _mm_and_ps(a, _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff)));
}

//------------------------------------------------------------------------------
__forceinline float
get_first_f32x4(f32x4 a)
{
    return _mm_cvtss_f32(a);
}

//------------------------------------------------------------------------------
__forceinline f32x4
rcp_f32x4(f32x4 a)
{
    // approximate reciprocal, roughly 12 bits of precision
    return _mm_rcp_ps(a);
}

//------------------------------------------------------------------------------
__forceinline f32x4
rsqrt_f32x4(f32x4 a)
{
    // approximate reciprocal square root, roughly 12 bits of precision
    return _mm_rsqrt_ps(a);
}

//------------------------------------------------------------------------------
__forceinline f32x4
sqrt_f32x4(f32x4 a)
{
    return _mm_sqrt_ps(a);
}

//------------------------------------------------------------------------------
__forceinline f32x4
max_f32x4(f32x4 a, f32x4 b)
{
    return _mm_max_ps(a, b);
}

//------------------------------------------------------------------------------
__forceinline f32x4
max_first_f32x4(f32x4 a, f32x4 b)
{
    return _mm_max_ss(a, b);
}

//------------------------------------------------------------------------------
__forceinline f32x4
min_f32x4(f32x4 a, f32x4 b)
{
    return _mm_min_ps(a, b);
}

//------------------------------------------------------------------------------
__forceinline f32x4
min_first_f32x4(f32x4 a, f32x4 b)
{
    return _mm_min_ss(a, b);
}

//------------------------------------------------------------------------------
// lane indices must be compile-time constants; the result is
// { a[a0], a[a1], b[b0], b[b1] }, matching the NEON variant below
// (_MM_SHUFFLE takes its arguments highest lane first, hence the reversed order)
#define shuffle_f32x4(a, b, a0, a1, b0, b1) (_mm_shuffle_ps(a, b, _MM_SHUFFLE(b1, b0, a1, a0)))
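// e.g. shuffle_f32x4(v, v, 1, 0, 3, 2) yields { v.y, v.x, v.w, v.z }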

//------------------------------------------------------------------------------
__forceinline f32x4
convert_u32x4_to_f32x4(u32x4 a)
{
    // assumes u32x4 is never anything but a comparison
    return _mm_castsi128_ps(a);
}

#elif NEBULA_SIMD_AARCH64
#include <arm_neon.h>
typedef float32x4_t f32x4;
typedef int32x4_t i32x4;
typedef uint32x4_t u32x4;

//------------------------------------------------------------------------------
__forceinline f32x4
set_f32x4(float x, float y, float z, float w)
{
    return f32x4{x, y, z, w};
}

//------------------------------------------------------------------------------
__forceinline i32x4
set_i32x4(int32_t x, int32_t y, int32_t z, int32_t w)
{
    return i32x4{x, y, z, w};
}

//------------------------------------------------------------------------------
__forceinline f32x4
splat_f32x4(float x)
{
    return vdupq_n_f32(x);
}

//------------------------------------------------------------------------------
__forceinline f32x4
cast_i32x4_to_f32x4(i32x4 x)
{
    // vreinterpretq_f32_s32 reinterprets int lanes as float; the _s32_f32
    // variant goes the other way and would not type-check here
    return vreinterpretq_f32_s32(x);
}

//------------------------------------------------------------------------------
__forceinline f32x4
set_last_f32x4(f32x4 v, float val)
{
    return vsetq_lane_f32(val, v, 3);
}

//------------------------------------------------------------------------------
__forceinline u32x4
compare_equal_f32x4(f32x4 a, f32x4 b)
{
    return vceqq_f32(a, b);
}

//------------------------------------------------------------------------------
__forceinline u32x4
compare_greater_equal_f32x4(f32x4 a, f32x4 b)
{
    return vcgeq_f32(a, b);
}

//------------------------------------------------------------------------------
__forceinline u32x4
compare_greater_f32x4(f32x4 a, f32x4 b)
{
    return vcgtq_f32(a, b);
}

//------------------------------------------------------------------------------
__forceinline u32x4
compare_less_equal_f32x4(f32x4 a, f32x4 b)
{
    return vcleq_f32(a, b);
}

//------------------------------------------------------------------------------
__forceinline u32x4
compare_less_f32x4(f32x4 a, f32x4 b)
{
    return vcltq_f32(a, b);
}

//------------------------------------------------------------------------------
__forceinline bool
all_u32x4(u32x4 cmp)
{
    uint32x2_t low = vget_low_u32(cmp);
    uint32x2_t high = vget_high_u32(cmp);

    uint32x2_t and1 = vand_u32(low, high);
    uint32_t res = vget_lane_u32(and1, 0) & vget_lane_u32(and1, 1);

    return res == 0xFFFFFFFF;
}

//------------------------------------------------------------------------------
__forceinline bool
all_u32x3(u32x4 cmp)
{
    uint32x2_t low = vget_low_u32(cmp);

    // AND lanes x and y with a splat of lane z, then fold: x & y & z
    uint32x2_t and1 = vand_u32(low, vdup_n_u32(vgetq_lane_u32(cmp, 2)));
    uint32_t res = vget_lane_u32(and1, 0) & vget_lane_u32(and1, 1);

    return res == 0xFFFFFFFF;
}

//------------------------------------------------------------------------------
__forceinline bool
any_u32x4(u32x4 cmp)
{
    uint32x2_t low = vget_low_u32(cmp);
    uint32x2_t high = vget_high_u32(cmp);

    // OR rather than AND: a single set lane must be enough
    uint32x2_t or1 = vorr_u32(low, high);
    uint32_t res = vget_lane_u32(or1, 0) | vget_lane_u32(or1, 1);

    return res != 0x0;
}

//------------------------------------------------------------------------------
__forceinline bool
any_u32x3(u32x4 cmp)
{
    uint32x2_t low = vget_low_u32(cmp);

    uint32x2_t or1 = vorr_u32(low, vdup_n_u32(vgetq_lane_u32(cmp, 2)));
    uint32_t res = vget_lane_u32(or1, 0) | vget_lane_u32(or1, 1);

    return res != 0x0;
}

//------------------------------------------------------------------------------
__forceinline f32x4
load_unaligned_f32x3(const scalar* ptr)
{
    // loads four floats (16 bytes), then zeroes the w lane
    f32x4 vec = vld1q_f32(ptr);
    return set_last_f32x4(vec, 0);
}

//------------------------------------------------------------------------------
__forceinline f32x4
load_aligned_f32x3(const scalar* ptr)
{
    f32x4 vec = vld1q_f32(ptr);
    return set_last_f32x4(vec, 0);
}

//------------------------------------------------------------------------------
__forceinline void
store_f32x3(f32x4 vec, scalar* ptr)
{
    ptr[0] = vgetq_lane_f32(vec, 0);
    ptr[1] = vgetq_lane_f32(vec, 1);
    ptr[2] = vgetq_lane_f32(vec, 2);
}

//------------------------------------------------------------------------------
__forceinline void
store_f32x4(f32x4 vec, scalar* ptr)
{
    // vst1q_f32 takes the destination pointer first
    vst1q_f32(ptr, vec);
}

//------------------------------------------------------------------------------
__forceinline void
store_f32(f32x4 vec, scalar* ptr)
{
    ptr[0] = vgetq_lane_f32(vec, 0);
}

//------------------------------------------------------------------------------
__forceinline f32x4
flip_sign_f32x4(f32x4 vec)
{
    static const uint32x4_t sign_mask = vdupq_n_u32(0x80000000);
    return vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(vec), sign_mask));
}

//------------------------------------------------------------------------------
__forceinline f32x4
mul_f32x4(f32x4 a, f32x4 b)
{
    return vmulq_f32(a, b);
}

//------------------------------------------------------------------------------
__forceinline f32x4
mul_first_f32x4(f32x4 a, f32x4 b)
{
    float first = vgetq_lane_f32(a, 0) * vgetq_lane_f32(b, 0);
    return vsetq_lane_f32(first, a, 0);
}

//------------------------------------------------------------------------------
__forceinline f32x4
fma_f32x4(f32x4 a, f32x4 b, f32x4 c)
{
    // vmlaq_f32(acc, x, y) computes acc + x * y, so c goes first to match a * b + c
    return vmlaq_f32(c, a, b);
}

//------------------------------------------------------------------------------
__forceinline f32x4
mad_f32x4(f32x4 a, f32x4 b, f32x4 c)
{
    return vmlaq_f32(c, a, b);
}

//------------------------------------------------------------------------------
__forceinline f32x4
div_f32x4(f32x4 a, f32x4 b)
{
    // reciprocal estimate refined with two Newton-Raphson steps against b
    f32x4 recip = vrecpeq_f32(b);
    recip = vmulq_f32(recip, vrecpsq_f32(b, recip));
    recip = vmulq_f32(recip, vrecpsq_f32(b, recip));
    return vmulq_f32(a, recip);
}

//------------------------------------------------------------------------------
__forceinline f32x4
add_f32x4(f32x4 a, f32x4 b)
{
    return vaddq_f32(a, b);
}

//------------------------------------------------------------------------------
__forceinline f32x4
sub_f32x4(f32x4 a, f32x4 b)
{
    return vsubq_f32(a, b);
}

//------------------------------------------------------------------------------
__forceinline f32x4
dot_f32x4(f32x4 a, f32x4 b)
{
    f32x4 prod = vmulq_f32(a, b);
    float32x2_t sum2 = vadd_f32(vget_low_f32(prod), vget_high_f32(prod)); // {p0+p2, p1+p3}
    float sum = vget_lane_f32(sum2, 0) + vget_lane_f32(sum2, 1);
    return vdupq_n_f32(sum); // broadcast the dot product, matching the SSE variant
}

//------------------------------------------------------------------------------
__forceinline scalar
dot_f32x3(f32x4 a, f32x4 b)
{
    f32x4 prod = vmulq_f32(a, b);
    float32x2_t low = vget_low_f32(prod);   // {p0, p1}
    float32x2_t sum2 = vpadd_f32(low, low); // {p0+p1, p0+p1}
    return vget_lane_f32(sum2, 0) + vgetq_lane_f32(prod, 2);
}

//------------------------------------------------------------------------------
__forceinline f32x4
abs_f32x4(f32x4 a)
{
    return vabsq_f32(a);
}

//------------------------------------------------------------------------------
__forceinline scalar
get_first_f32x4(f32x4 a)
{
    return vgetq_lane_f32(a, 0);
}

//------------------------------------------------------------------------------
__forceinline f32x4
rcp_f32x4(f32x4 a)
{
    // raw estimate only; div_f32x4 shows the Newton-Raphson refined version
    return vrecpeq_f32(a);
}

//------------------------------------------------------------------------------
__forceinline f32x4
rsqrt_f32x4(f32x4 a)
{
    // Newton-Raphson: x' = x * (3 - a*x*x) / 2, where vrsqrtsq_f32(p, q)
    // computes (3 - p*q) / 2, so the estimate must be multiplied by a first
    f32x4 step = vrsqrteq_f32(a);
    step = vmulq_f32(step, vrsqrtsq_f32(vmulq_f32(a, step), step));
    step = vmulq_f32(step, vrsqrtsq_f32(vmulq_f32(a, step), step));
    return step;
}

//------------------------------------------------------------------------------
__forceinline f32x4
sqrt_f32x4(f32x4 a)
{
    // AArch64 provides an exact vector sqrt; a * rsqrt(a) would produce NaN
    // for zero lanes
    return vsqrtq_f32(a);
}

//------------------------------------------------------------------------------
__forceinline f32x4
max_f32x4(f32x4 a, f32x4 b)
{
    return vmaxq_f32(a, b);
}

//------------------------------------------------------------------------------
__forceinline f32x4
max_first_f32x4(f32x4 a, f32x4 b)
{
    float a0 = vgetq_lane_f32(a, 0);
    float b0 = vgetq_lane_f32(b, 0);
    float largest = a0 < b0 ? b0 : a0;
    return vsetq_lane_f32(largest, a, 0);
}

//------------------------------------------------------------------------------
__forceinline f32x4
min_f32x4(f32x4 a, f32x4 b)
{
    return vminq_f32(a, b);
}

//------------------------------------------------------------------------------
__forceinline f32x4
min_first_f32x4(f32x4 a, f32x4 b)
{
    float a0 = vgetq_lane_f32(a, 0);
    float b0 = vgetq_lane_f32(b, 0);
    float smallest = a0 > b0 ? b0 : a0;
    return vsetq_lane_f32(smallest, a, 0);
}

//------------------------------------------------------------------------------
__forceinline f32x4
shuffle_f32x4(f32x4 a, f32x4 b, uint8_t a0, uint8_t a1, uint8_t b0, uint8_t b1)
{
    // vgetq_lane_f32 requires compile-time lane indices, so go through memory
    // to support runtime ones
    float ta[4], tb[4];
    vst1q_f32(ta, a);
    vst1q_f32(tb, b);
    float res[4] = { ta[a0], ta[a1], tb[b0], tb[b1] };
    return vld1q_f32(res);
}

//------------------------------------------------------------------------------
__forceinline f32x4
convert_u32x4_to_f32x4(u32x4 a)
{
    // assumes u32x4 is never anything but a comparison mask (as on the SSE
    // path), so reinterpret the bits rather than converting numerically
    return vreinterpretq_f32_u32(a);
}

#endif
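
//------------------------------------------------------------------------------
// Illustrative sketch (not part of the original API): composing the primitives
// above to normalize a 3-component vector in place. `v` is a hypothetical
// pointer with at least four readable floats, since the f32x3 loads fetch a
// full 16 bytes.
//
//     f32x4 vec = load_unaligned_f32x3(v);
//     float len_sq = dot_f32x3(vec, vec);
//     f32x4 inv_len = rsqrt_f32x4(splat_f32x4(len_sq));
//     store_f32x3(mul_f32x4(vec, inv_len), v);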