f32x4 cast_i32x4_to_f32x4(i32x4 x);
i32x4 set_i32x4(int32_t x, int32_t y, int32_t z, int32_t w);
static const f32x4 _mask_xyz = cast_i32x4_to_f32x4(set_i32x4(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
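// _mask_xyz keeps the x, y and z lanes set and clears the fourth lane; the
// load_*_f32x3 helpers below AND against it so 3-component loads never carry
// garbage in w.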
set_f32x4(float x, float y, float z, float w)
{
    return _mm_setr_ps(x, y, z, w);
}
set_i32x4(int32_t x, int32_t y, int32_t z, int32_t w)
{
    return _mm_setr_epi32(x, y, z, w);
}
// splat a single float across all four lanes
    return _mm_set1_ps(x);
cast_i32x4_to_f32x4(i32x4 x)
{
    return _mm_castsi128_ps(x);
}
set_last_f32x4(f32x4 v, float val)
{
    f32x4 vec = _mm_set_ss(val);
    // immediate 0b00110000: take lane 0 of vec, write it into lane 3 of v, zero nothing
    return _mm_insert_ps(v, vec, 0b00110000);
}
compare_equal_f32x4(f32x4 a, f32x4 b)
{
    return _mm_castps_si128(_mm_cmpeq_ps(a, b));
}

compare_greater_equal_f32x4(f32x4 a, f32x4 b)
{
    return _mm_castps_si128(_mm_cmpge_ps(a, b));
}

compare_greater_f32x4(f32x4 a, f32x4 b)
{
    return _mm_castps_si128(_mm_cmpgt_ps(a, b));
}

compare_less_equal_f32x4(f32x4 a, f32x4 b)
{
    return _mm_castps_si128(_mm_cmple_ps(a, b));
}

compare_less_f32x4(f32x4 a, f32x4 b)
{
    return _mm_castps_si128(_mm_cmplt_ps(a, b));
}
// all four lanes of the comparison mask set
    return _mm_movemask_ps(_mm_castsi128_ps(a)) == 0xF;
// first three lanes of the comparison mask set (fourth lane ignored)
    return (_mm_movemask_ps(_mm_castsi128_ps(a)) & 0x7) == 0x7;
// any lane of the comparison mask set
    return _mm_movemask_ps(_mm_castsi128_ps(a)) != 0x0;
// any of the first three lanes of the comparison mask set
    return (_mm_movemask_ps(_mm_castsi128_ps(a)) & 0x7) != 0x0;
load_unaligned_f32x3(const float* ptr)
{
    f32x4 vec = _mm_loadu_ps(ptr);
    return _mm_and_ps(vec, _mask_xyz);
}
load_aligned_f32x3(const float* ptr)
{
    f32x4 vec = _mm_load_ps(ptr);
    return _mm_and_ps(vec, _mask_xyz);
}
store_f32x3(f32x4 vec, float* ptr)
{
    f32x4 t1 = _mm_permute_ps(vec, _MM_SHUFFLE(1, 1, 1, 1));
    f32x4 t2 = _mm_permute_ps(vec, _MM_SHUFFLE(2, 2, 2, 2));
    _mm_store_ss(&ptr[0], vec);
    _mm_store_ss(&ptr[1], t1);
    _mm_store_ss(&ptr[2], t2);
}
store_f32x4(f32x4 vec, float* ptr)
{
    _mm_store_ps(ptr, vec);
}
store_f32(f32x4 vec, float* ptr)
{
    _mm_store_ss(ptr, vec);
}
flip_sign_f32x4(f32x4 vec)
{
    static const __m128i _sign = _mm_setr_epi32(0x80000000, 0x80000000, 0x80000000, 0x80000000);
    return _mm_xor_ps(_mm_castsi128_ps(_sign), vec);
}
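// flip_sign_f32x4 XORs the sign bit of every lane without touching the magnitude bits,
// e.g. flip_sign_f32x4(set_f32x4(1.0f, -2.0f, 3.0f, 4.0f)) yields {-1, 2, -3, -4}.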
mul_f32x4(f32x4 a, f32x4 b)
{
    return _mm_mul_ps(a, b);
}

mul_first_f32x4(f32x4 a, f32x4 b)
{
    return _mm_mul_ss(a, b);
}
fma_f32x4(f32x4 a, f32x4 b, f32x4 c)
{
#if defined(__FMA__) // guard macro assumed; both branches compute a * b + c
    return _mm_fmadd_ps(a, b, c);
#else
    return _mm_add_ps(_mm_mul_ps(a, b), c);
#endif
}
div_f32x4(f32x4 a, f32x4 b)
{
    return _mm_div_ps(a, b);
}

add_f32x4(f32x4 a, f32x4 b)
{
    return _mm_add_ps(a, b);
}

sub_f32x4(f32x4 a, f32x4 b)
{
    return _mm_sub_ps(a, b);
}
dot_f32x4(f32x4 a, f32x4 b)
{
    // multiply all four lanes and return the scalar sum, matching the NEON variant
    return _mm_cvtss_f32(_mm_dp_ps(a, b, 0xFF));
}
dot_f32x3(f32x4 a, f32x4 b)
{
    return _mm_cvtss_f32(_mm_dp_ps(a, b, 0x71));
}
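// For example, dot_f32x3(set_f32x4(1, 2, 3, 0), set_f32x4(4, 5, 6, 0)) evaluates
// 1*4 + 2*5 + 3*6 = 32.0f; the 0x71 mask multiplies only lanes 0..2 and writes the
// sum to lane 0, so whatever sits in the fourth lane is ignored.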
// clear the sign bit in every lane (absolute value)
    unsigned int val = 0x7fffffff;
    f32x4 temp = _mm_set1_ps(*(float*)&val);
    return _mm_and_ps(a, temp);
get_first_f32x4(f32x4 a)
{
    return _mm_cvtss_f32(a);
}
// approximate reciprocal of each lane
    return _mm_rcp_ps(a);
// approximate reciprocal square root of each lane
    return _mm_rsqrt_ps(a);
// square root of each lane
    return _mm_sqrt_ps(a);
max_f32x4(f32x4 a, f32x4 b)
{
    return _mm_max_ps(a, b);
}

max_first_f32x4(f32x4 a, f32x4 b)
{
    return _mm_max_ss(a, b);
}

min_f32x4(f32x4 a, f32x4 b)
{
    return _mm_min_ps(a, b);
}

min_first_f32x4(f32x4 a, f32x4 b)
{
    return _mm_min_ss(a, b);
}
// result = { a[a0], a[a1], b[b0], b[b1] }, same lane convention as the NEON variant
#define shuffle_f32x4(a, b, a0, a1, b0, b1) (_mm_shuffle_ps(a, b, _MM_SHUFFLE(b1, b0, a1, a0)))
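// For example, shuffle_f32x4(a, b, 0, 1, 2, 3) produces { a[0], a[1], b[2], b[3] };
// the lane indices must be compile-time constants.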
convert_u32x4_to_f32x4(u32x4 a)
{
    // lane-wise integer-to-float conversion; treats the input as signed, so values must stay below 2^31
    return _mm_cvtepi32_ps(a);
}
#elif NEBULA_SIMD_AARCH64

typedef float32x4_t f32x4;
typedef int32x4_t i32x4;
typedef uint32x4_t u32x4;
set_f32x4(float x, float y, float z, float w)
{
    return f32x4{x, y, z, w};
}
set_i32x4(int32_t x, int32_t y, int32_t z, int32_t w)
{
    return i32x4{x, y, z, w};
}
// splat a single float across all four lanes
    return vdupq_n_f32(x);
cast_i32x4_to_f32x4(i32x4 x)
{
    return vreinterpretq_f32_s32(x);
}
set_last_f32x4(f32x4 v, float val)
{
    return vsetq_lane_f32(val, v, 3);
}
compare_equal_f32x4(f32x4 a, f32x4 b)
{
    return vceqq_f32(a, b);
}

compare_greater_equal_f32x4(f32x4 a, f32x4 b)
{
    return vcgeq_f32(a, b);
}

compare_greater_f32x4(f32x4 a, f32x4 b)
{
    return vcgtq_f32(a, b);
}

compare_less_equal_f32x4(f32x4 a, f32x4 b)
{
    return vcleq_f32(a, b);
}

compare_less_f32x4(f32x4 a, f32x4 b)
{
    return vcltq_f32(a, b);
}
// all four lanes of the comparison mask set
    uint32x2_t low = vget_low_u32(cmp);
    uint32x2_t high = vget_high_u32(cmp);
    uint32x2_t and1 = vand_u32(low, high);
    uint32_t res = vget_lane_u32(and1, 0) & vget_lane_u32(and1, 1);
    return res == 0xFFFFFFFF;
// first three lanes of the comparison mask set (fourth lane ignored)
    uint32x2_t low = vget_low_u32(cmp);
    uint32x2_t and1 = vand_u32(low, vdup_n_u32(vgetq_lane_u32(cmp, 2)));
    uint32_t res = vget_lane_u32(and1, 0) & vget_lane_u32(and1, 1);
    return res == 0xFFFFFFFF;
// any lane of the comparison mask set
    uint32x2_t low = vget_low_u32(cmp);
    uint32x2_t high = vget_high_u32(cmp);
    uint32x2_t or1 = vorr_u32(low, high);
    uint32_t res = vget_lane_u32(or1, 0) | vget_lane_u32(or1, 1);
    return res != 0x0;
// any of the first three lanes of the comparison mask set
    uint32x2_t low = vget_low_u32(cmp);
    uint32x2_t or1 = vorr_u32(low, vdup_n_u32(vgetq_lane_u32(cmp, 2)));
    uint32_t res = vget_lane_u32(or1, 0) | vget_lane_u32(or1, 1);
    return res != 0x0;
load_unaligned_f32x3(const scalar* ptr)
{
    f32x4 vec = vld1q_f32(ptr);
    return set_last_f32x4(vec, 0);
}
load_aligned_f32x3(const scalar* ptr)
{
    f32x4 vec = vld1q_f32(ptr);
    return set_last_f32x4(vec, 0);
}
store_f32x3(f32x4 vec, scalar* ptr)
{
    ptr[0] = vgetq_lane_f32(vec, 0);
    ptr[1] = vgetq_lane_f32(vec, 1);
    ptr[2] = vgetq_lane_f32(vec, 2);
}
store_f32x4(f32x4 vec, scalar* ptr)
{
    vst1q_f32(ptr, vec);
}
store_f32(f32x4 vec, scalar* ptr)
{
    ptr[0] = vgetq_lane_f32(vec, 0);
}
flip_sign_f32x4(f32x4 vec)
{
    static const uint32x4_t sign_mask = vdupq_n_u32(0x80000000);
    return vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(vec), sign_mask));
}
mul_f32x4(f32x4 a, f32x4 b)
{
    return vmulq_f32(a, b);
}

mul_first_f32x4(f32x4 a, f32x4 b)
{
    float first = vgetq_lane_f32(a, 0) * vgetq_lane_f32(b, 0);
    return vsetq_lane_f32(first, a, 0);
}
fma_f32x4(f32x4 a, f32x4 b, f32x4 c)
{
    // vmlaq_f32(acc, x, y) computes acc + x * y, so c is the accumulator for a * b + c
    return vmlaq_f32(c, a, b);
}

mad_f32x4(f32x4 a, f32x4 b, f32x4 c)
{
    return vmlaq_f32(c, a, b);
}
div_f32x4(f32x4 a, f32x4 b)
{
    f32x4 recip = vrecpeq_f32(b);
    recip = vmulq_f32(recip, vrecpsq_f32(b, recip));
    recip = vmulq_f32(recip, vrecpsq_f32(b, recip));
    return vmulq_f32(a, recip);
}
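// In div_f32x4 the two vrecpsq_f32 steps are Newton-Raphson refinements of the initial
// reciprocal estimate, so the quotient is a close approximation rather than an exact
// IEEE division.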
add_f32x4(f32x4 a, f32x4 b)
{
    return vaddq_f32(a, b);
}
sub_f32x4(f32x4 a, f32x4 b)
{
    return vsubq_f32(a, b);
}
dot_f32x4(f32x4 a, f32x4 b)
{
    f32x4 prod = vmulq_f32(a, b);
    float32x2_t sum2 = vadd_f32(vget_low_f32(prod), vget_high_f32(prod));
    return vget_lane_f32(sum2, 0) + vget_lane_f32(sum2, 1);
}
dot_f32x3(f32x4 a, f32x4 b)
{
    f32x4 prod = vmulq_f32(a, b);
    float32x2_t low = vget_low_f32(prod);
    float32x2_t sum2 = vpadd_f32(low, low); // both lanes hold x + y
    return vget_lane_f32(sum2, 0) + vgetq_lane_f32(prod, 2);
}
get_first_f32x4(f32x4 a)
{
    return vgetq_lane_f32(a, 0);
}
// approximate reciprocal of each lane
    return vrecpeq_f32(a);
rsqrt_f32x4(f32x4 a)
{
    // reciprocal square root estimate refined twice with Newton-Raphson
    f32x4 step = vrsqrteq_f32(a);
    step = vmulq_f32(step, vrsqrtsq_f32(vmulq_f32(a, step), step));
    step = vmulq_f32(step, vrsqrtsq_f32(vmulq_f32(a, step), step));
    return step;
}
// square root computed as a * (1 / sqrt(a)); note this yields NaN for a == 0
    f32x4 rsqrt = rsqrt_f32x4(a);
    return vmulq_f32(a, rsqrt);
max_f32x4(f32x4 a, f32x4 b)
{
    return vmaxq_f32(a, b);
}
max_first_f32x4(f32x4 a, f32x4 b)
{
    float a0 = vgetq_lane_f32(a, 0);
    float b0 = vgetq_lane_f32(b, 0);
    float largest = a0 < b0 ? b0 : a0;
    return vsetq_lane_f32(largest, a, 0);
}
min_f32x4(f32x4 a, f32x4 b)
{
    return vminq_f32(a, b);
}
min_first_f32x4(f32x4 a, f32x4 b)
{
    float a0 = vgetq_lane_f32(a, 0);
    float b0 = vgetq_lane_f32(b, 0);
    float smallest = a0 > b0 ? b0 : a0;
    return vsetq_lane_f32(smallest, a, 0);
}
shuffle_f32x4(f32x4 a, f32x4 b, uint8_t a0, uint8_t a1, uint8_t b0, uint8_t b1)
{
    // result = { a[a0], a[a1], b[b0], b[b1] }; the lane extracts require compile-time constant indices
    return set_f32x4(vgetq_lane_f32(a, a0), vgetq_lane_f32(a, a1),
                     vgetq_lane_f32(b, b0), vgetq_lane_f32(b, b1));
}
convert_u32x4_to_f32x4(u32x4 a)
{
    return vcvtq_f32_u32(a);
}
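// Illustrative usage sketch (not part of the wrapper API): a hypothetical helper that
// reflects a direction about a unit normal, r = v - 2 * dot(v, n) * n, composed purely
// from the portable wrappers above. Both inputs are assumed to carry 0 in the fourth lane.
static inline f32x4
reflect_f32x3(f32x4 v, f32x4 n)
{
    const float d2 = 2.0f * dot_f32x3(v, n);          // 2 * (v . n)
    const f32x4 scale = set_f32x4(d2, d2, d2, 0.0f);  // broadcast across x, y, z
    return sub_f32x4(v, mul_f32x4(scale, n));         // v - scale * n
}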