#ifndef __IMMINTRIN_H
#error "Never use <avx512vldqintrin.h> directly; include <immintrin.h> instead."
#endif

#ifndef __AVX512VLDQINTRIN_H
#define __AVX512VLDQINTRIN_H

/* Define the default function attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512dq")))

static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_mullo_epi64 (__m256i __A, __m256i __B) {
  return (__m256i) ((__v4du) __A * (__v4du) __B);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_mask_mullo_epi64 (__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) {
  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
      (__v4di)_mm256_mullo_epi64(__A, __B), (__v4di)__W);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_maskz_mullo_epi64 (__mmask8 __U, __m256i __A, __m256i __B) {
  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
      (__v4di)_mm256_mullo_epi64(__A, __B), (__v4di)_mm256_setzero_si256());
}

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_mullo_epi64 (__m128i __A, __m128i __B) {
  return (__m128i) ((__v2du) __A * (__v2du) __B);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_mask_mullo_epi64 (__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
      (__v2di)_mm_mullo_epi64(__A, __B), (__v2di)__W);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_maskz_mullo_epi64 (__mmask8 __U, __m128i __A, __m128i __B) {
  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
      (__v2di)_mm_mullo_epi64(__A, __B), (__v2di)_mm_setzero_si128());
}
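/* Usage sketch (illustrative, not part of the header): callers include
 * <immintrin.h> and compile with -mavx512vl -mavx512dq. Mask bit i selects
 * whether lane i receives the product or the passthrough/zero value.
 *
 *   __m256i a = _mm256_set1_epi64x(3), b = _mm256_set1_epi64x(5);
 *   __m256i p = _mm256_mullo_epi64(a, b);              // every lane 15
 *   __m256i m = _mm256_mask_mullo_epi64(a, 0x5, a, b); // lanes 0,2: 15; 1,3: 3
 *   __m256i z = _mm256_maskz_mullo_epi64(0x5, a, b);   // lanes 0,2: 15; 1,3: 0
 */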
static __inline__ __m256d __DEFAULT_FN_ATTRS
_mm256_mask_andnot_pd (__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) {
  return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
      (__v4df)_mm256_andnot_pd(__A, __B), (__v4df)__W);
}

static __inline__ __m256d __DEFAULT_FN_ATTRS
_mm256_maskz_andnot_pd (__mmask8 __U, __m256d __A, __m256d __B) {
  return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
      (__v4df)_mm256_andnot_pd(__A, __B), (__v4df)_mm256_setzero_pd());
}

static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_mask_andnot_pd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
  return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
      (__v2df)_mm_andnot_pd(__A, __B), (__v2df)__W);
}

static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_maskz_andnot_pd (__mmask8 __U, __m128d __A, __m128d __B) {
  return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
      (__v2df)_mm_andnot_pd(__A, __B), (__v2df)_mm_setzero_pd());
}

/* The mask/maskz wrappers for _mm256_andnot_ps, _mm_andnot_ps, _mm256_and_pd,
 * _mm_and_pd, _mm256_and_ps, _mm_and_ps, _mm256_xor_pd, _mm_xor_pd,
 * _mm256_xor_ps, _mm_xor_ps, _mm256_or_pd, _mm_or_pd, _mm256_or_ps and
 * _mm_or_ps follow the same pattern: compute the unmasked AVX result, then
 * blend it with __W (mask) or a zero vector (maskz) through the matching
 * __builtin_ia32_selectp{s,d}_{128,256} builtin. */
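/* Usage sketch (illustrative): the masked logical forms blend per lane, so
 * with mask 0x3 only the low two double lanes take the AND result.
 *
 *   __m256d x = _mm256_set1_pd(1.0), y = _mm256_set1_pd(2.0);
 *   __m256d k = _mm256_mask_and_pd(x, 0x3, x, y);  // lanes 2,3 keep x
 *   __m256d z = _mm256_maskz_and_pd(0x3, x, y);    // lanes 2,3 are 0.0
 */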
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_cvtpd_epi64 (__m128d __A) {
  return (__m128i) __builtin_ia32_cvtpd2qq128_mask ((__v2df) __A,
                                                    (__v2di) _mm_setzero_si128(),
                                                    (__mmask8) -1);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_mask_cvtpd_epi64 (__m128i __W, __mmask8 __U, __m128d __A) {
  return (__m128i) __builtin_ia32_cvtpd2qq128_mask ((__v2df) __A,
                                                    (__v2di) __W,
                                                    (__mmask8) __U);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_maskz_cvtpd_epi64 (__mmask8 __U, __m128d __A) {
  return (__m128i) __builtin_ia32_cvtpd2qq128_mask ((__v2df) __A,
                                                    (__v2di) _mm_setzero_si128(),
                                                    (__mmask8) __U);
}

/* The remaining 64-bit conversions are defined the same way, as unmasked,
 * mask and maskz triples over the corresponding masked builtin:
 *   _mm{,256}_cvtpd_epi64   -> __builtin_ia32_cvtpd2qq{128,256}_mask
 *   _mm{,256}_cvtpd_epu64   -> __builtin_ia32_cvtpd2uqq{128,256}_mask
 *   _mm{,256}_cvtps_epi64   -> __builtin_ia32_cvtps2qq{128,256}_mask
 *   _mm{,256}_cvtps_epu64   -> __builtin_ia32_cvtps2uqq{128,256}_mask
 *   _mm{,256}_cvtepi64_pd   -> __builtin_ia32_cvtqq2pd{128,256}_mask
 *   _mm{,256}_cvtepi64_ps   -> __builtin_ia32_cvtqq2ps{128,256}_mask
 *   _mm{,256}_cvttpd_epi64  -> __builtin_ia32_cvttpd2qq{128,256}_mask
 *   _mm{,256}_cvttpd_epu64  -> __builtin_ia32_cvttpd2uqq{128,256}_mask
 *   _mm{,256}_cvttps_epi64  -> __builtin_ia32_cvttps2qq{128,256}_mask
 *   _mm{,256}_cvttps_epu64  -> __builtin_ia32_cvttps2uqq{128,256}_mask
 *   _mm{,256}_cvtepu64_pd   -> __builtin_ia32_cvtuqq2pd{128,256}_mask
 *   _mm{,256}_cvtepu64_ps   -> __builtin_ia32_cvtuqq2ps{128,256}_mask
 * Note that the 256-bit long-to-float conversions return __m128. */
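/* Usage sketch (illustrative): cvt* rounds according to the current
 * rounding mode (default: nearest-even), while cvtt* truncates toward zero.
 *
 *   __m128d d  = _mm_set_pd(-1.5, 2.5);   // lane0 = 2.5, lane1 = -1.5
 *   __m128i rq = _mm_cvtpd_epi64(d);      // {2, -2}  (2.5 -> 2, -1.5 -> -2)
 *   __m128i tq = _mm_cvttpd_epi64(d);     // {2, -1}  (truncation)
 */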
#define _mm_range_pd(A, B, C) __extension__ ({ \
  (__m128d)__builtin_ia32_rangepd128_mask((__v2df)(__m128d)(A), \
                                          (__v2df)(__m128d)(B), (int)(C), \
                                          (__v2df)_mm_setzero_pd(), \
                                          (__mmask8)-1); })

#define _mm_mask_range_pd(W, U, A, B, C) __extension__ ({ \
  (__m128d)__builtin_ia32_rangepd128_mask((__v2df)(__m128d)(A), \
                                          (__v2df)(__m128d)(B), (int)(C), \
                                          (__v2df)(__m128d)(W), \
                                          (__mmask8)(U)); })

#define _mm_maskz_range_pd(U, A, B, C) __extension__ ({ \
  (__m128d)__builtin_ia32_rangepd128_mask((__v2df)(__m128d)(A), \
                                          (__v2df)(__m128d)(B), (int)(C), \
                                          (__v2df)_mm_setzero_pd(), \
                                          (__mmask8)(U)); })

/* _mm256_range_pd, _mm_range_ps and _mm256_range_ps, together with their
 * mask/maskz forms, expand the same way over
 * __builtin_ia32_rangep{d,s}{128,256}_mask. */

#define _mm_reduce_pd(A, B) __extension__ ({ \
  (__m128d)__builtin_ia32_reducepd128_mask((__v2df)(__m128d)(A), (int)(B), \
                                           (__v2df)_mm_setzero_pd(), \
                                           (__mmask8)-1); })

#define _mm_mask_reduce_pd(W, U, A, B) __extension__ ({ \
  (__m128d)__builtin_ia32_reducepd128_mask((__v2df)(__m128d)(A), (int)(B), \
                                           (__v2df)(__m128d)(W), \
                                           (__mmask8)(U)); })

#define _mm_maskz_reduce_pd(U, A, B) __extension__ ({ \
  (__m128d)__builtin_ia32_reducepd128_mask((__v2df)(__m128d)(A), (int)(B), \
                                           (__v2df)_mm_setzero_pd(), \
                                           (__mmask8)(U)); })

/* _mm256_reduce_pd, _mm_reduce_ps and _mm256_reduce_ps, plus their mask and
 * maskz forms, follow the same pattern over
 * __builtin_ia32_reducep{d,s}{128,256}_mask. */
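/* Usage sketch (illustrative): for VRANGEPD, imm[1:0] selects the operation
 * (0 = min, 1 = max, 2 = absolute min, 3 = absolute max) and imm[3:2] the
 * sign control; VREDUCEPD subtracts the rounded value, leaving the
 * remainder.
 *
 *   __m128d a = _mm_set_pd(3.25, -1.5), b = _mm_set_pd(2.0, 4.0);
 *   __m128d mn = _mm_range_pd(a, b, 0x0);  // per-lane min form
 *   __m128d fr = _mm_reduce_pd(a, 0x0);    // fractional remainder per lane
 */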
static __inline__ __mmask8 __DEFAULT_FN_ATTRS
_mm_movepi32_mask (__m128i __A) {
  return (__mmask8) __builtin_ia32_cvtd2mask128 ((__v4si) __A);
}

static __inline__ __mmask8 __DEFAULT_FN_ATTRS
_mm256_movepi32_mask (__m256i __A) {
  return (__mmask8) __builtin_ia32_cvtd2mask256 ((__v8si) __A);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_movm_epi32 (__mmask8 __A) {
  return (__m128i) __builtin_ia32_cvtmask2d128 (__A);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_movm_epi32 (__mmask8 __A) {
  return (__m256i) __builtin_ia32_cvtmask2d256 (__A);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_movm_epi64 (__mmask8 __A) {
  return (__m128i) __builtin_ia32_cvtmask2q128 (__A);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_movm_epi64 (__mmask8 __A) {
  return (__m256i) __builtin_ia32_cvtmask2q256 (__A);
}

static __inline__ __mmask8 __DEFAULT_FN_ATTRS
_mm_movepi64_mask (__m128i __A) {
  return (__mmask8) __builtin_ia32_cvtq2mask128 ((__v2di) __A);
}

static __inline__ __mmask8 __DEFAULT_FN_ATTRS
_mm256_movepi64_mask (__m256i __A) {
  return (__mmask8) __builtin_ia32_cvtq2mask256 ((__v4di) __A);
}
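/* Usage sketch (illustrative): movm expands mask bits to all-ones/all-zero
 * lanes; movepi compresses the lane sign bits back into a mask.
 *
 *   __m256i v = _mm256_movm_epi64(0xA);    // lanes 1,3 = ~0, lanes 0,2 = 0
 *   __mmask8 k = _mm256_movepi64_mask(v);  // k == 0xA again
 */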
static __inline__ __m256 __DEFAULT_FN_ATTRS
_mm256_broadcast_f32x2 (__m128 __A) {
  return (__m256)__builtin_shufflevector((__v4sf)__A,
                                         (__v4sf)_mm_undefined_ps(),
                                         0, 1, 0, 1, 0, 1, 0, 1);
}

static __inline__ __m256 __DEFAULT_FN_ATTRS
_mm256_mask_broadcast_f32x2 (__m256 __O, __mmask8 __M, __m128 __A) {
  return (__m256)__builtin_ia32_selectps_256((__mmask8)__M,
                                             (__v8sf)_mm256_broadcast_f32x2(__A),
                                             (__v8sf)__O);
}

static __inline__ __m256 __DEFAULT_FN_ATTRS
_mm256_maskz_broadcast_f32x2 (__mmask8 __M, __m128 __A) {
  return (__m256)__builtin_ia32_selectps_256((__mmask8)__M,
                                             (__v8sf)_mm256_broadcast_f32x2(__A),
                                             (__v8sf)_mm256_setzero_ps());
}

static __inline__ __m256d __DEFAULT_FN_ATTRS
_mm256_broadcast_f64x2 (__m128d __A) {
  return (__m256d)__builtin_shufflevector((__v2df)__A, (__v2df)__A,
                                          0, 1, 0, 1);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_broadcast_i32x2 (__m128i __A) {
  return (__m128i)__builtin_shufflevector((__v4si)__A,
                                          (__v4si)_mm_undefined_si128(),
                                          0, 1, 0, 1);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_broadcast_i32x2 (__m128i __A) {
  return (__m256i)__builtin_shufflevector((__v4si)__A,
                                          (__v4si)_mm_undefined_si128(),
                                          0, 1, 0, 1, 0, 1, 0, 1);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_broadcast_i64x2 (__m128i __A) {
  return (__m256i)__builtin_shufflevector((__v2di)__A, (__v2di)__A,
                                          0, 1, 0, 1);
}

/* Each broadcast has mask/maskz companions built on the matching select
 * builtin; e.g. _mm256_mask_broadcast_i64x2 blends the broadcast result
 * into __O under __M via __builtin_ia32_selectq_256, and the maskz form
 * blends against a zero vector. */
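/* Usage sketch (illustrative): broadcast_f32x2 repeats the low two floats
 * of the source across the destination.
 *
 *   __m128 pair = _mm_setr_ps(1.0f, 2.0f, 0.0f, 0.0f);
 *   __m256 rep  = _mm256_broadcast_f32x2(pair);  // {1,2,1,2,1,2,1,2}
 */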
#define _mm256_extractf64x2_pd(A, imm) __extension__ ({ \
  (__m128d)__builtin_shufflevector((__v4df)(__m256d)(A), \
                                   (__v4df)_mm256_undefined_pd(), \
                                   ((imm) & 1) ? 2 : 0, \
                                   ((imm) & 1) ? 3 : 1); })

#define _mm256_mask_extractf64x2_pd(W, U, A, imm) __extension__ ({ \
  (__m128d)__builtin_ia32_selectpd_128((__mmask8)(U), \
                                       (__v2df)_mm256_extractf64x2_pd((A), (imm)), \
                                       (__v2df)(__m128d)(W)); })

#define _mm256_maskz_extractf64x2_pd(U, A, imm) __extension__ ({ \
  (__m128d)__builtin_ia32_selectpd_128((__mmask8)(U), \
                                       (__v2df)_mm256_extractf64x2_pd((A), (imm)), \
                                       (__v2df)_mm_setzero_pd()); })

#define _mm256_extracti64x2_epi64(A, imm) __extension__ ({ \
  (__m128i)__builtin_shufflevector((__v4di)(__m256i)(A), \
                                   (__v4di)_mm256_undefined_si256(), \
                                   ((imm) & 1) ? 2 : 0, \
                                   ((imm) & 1) ? 3 : 1); })

#define _mm256_mask_extracti64x2_epi64(W, U, A, imm) __extension__ ({ \
  (__m128i)__builtin_ia32_selectq_128((__mmask8)(U), \
                                      (__v2di)_mm256_extracti64x2_epi64((A), (imm)), \
                                      (__v2di)(__m128i)(W)); })

#define _mm256_maskz_extracti64x2_epi64(U, A, imm) __extension__ ({ \
  (__m128i)__builtin_ia32_selectq_128((__mmask8)(U), \
                                      (__v2di)_mm256_extracti64x2_epi64((A), (imm)), \
                                      (__v2di)_mm_setzero_di()); })

#define _mm256_insertf64x2(A, B, imm) __extension__ ({ \
  (__m256d)__builtin_shufflevector((__v4df)(A), \
                                   (__v4df)_mm256_castpd128_pd256((__m128d)(B)), \
                                   ((imm) & 0x1) ? 0 : 4, \
                                   ((imm) & 0x1) ? 1 : 5, \
                                   ((imm) & 0x1) ? 4 : 2, \
                                   ((imm) & 0x1) ? 5 : 3); })

#define _mm256_mask_insertf64x2(W, U, A, B, imm) __extension__ ({ \
  (__m256d)__builtin_ia32_selectpd_256((__mmask8)(U), \
                                       (__v4df)_mm256_insertf64x2((A), (B), (imm)), \
                                       (__v4df)(__m256d)(W)); })

#define _mm256_maskz_insertf64x2(U, A, B, imm) __extension__ ({ \
  (__m256d)__builtin_ia32_selectpd_256((__mmask8)(U), \
                                       (__v4df)_mm256_insertf64x2((A), (B), (imm)), \
                                       (__v4df)_mm256_setzero_pd()); })

#define _mm256_inserti64x2(A, B, imm) __extension__ ({ \
  (__m256i)__builtin_shufflevector((__v4di)(A), \
                                   (__v4di)_mm256_castsi128_si256((__m128i)(B)), \
                                   ((imm) & 0x1) ? 0 : 4, \
                                   ((imm) & 0x1) ? 1 : 5, \
                                   ((imm) & 0x1) ? 4 : 2, \
                                   ((imm) & 0x1) ? 5 : 3); })

#define _mm256_mask_inserti64x2(W, U, A, B, imm) __extension__ ({ \
  (__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \
                                      (__v4di)_mm256_inserti64x2((A), (B), (imm)), \
                                      (__v4di)(__m256i)(W)); })

#define _mm256_maskz_inserti64x2(U, A, B, imm) __extension__ ({ \
  (__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \
                                      (__v4di)_mm256_inserti64x2((A), (B), (imm)), \
                                      (__v4di)_mm256_setzero_si256()); })

#define _mm_mask_fpclass_pd_mask(U, A, imm) __extension__ ({ \
  (__mmask8)__builtin_ia32_fpclasspd128_mask((__v2df)(__m128d)(A), (int)(imm), \
                                             (__mmask8)(U)); })

#define _mm_fpclass_pd_mask(A, imm) __extension__ ({ \
  (__mmask8)__builtin_ia32_fpclasspd128_mask((__v2df)(__m128d)(A), (int)(imm), \
                                             (__mmask8)-1); })

#define _mm256_mask_fpclass_pd_mask(U, A, imm) __extension__ ({ \
  (__mmask8)__builtin_ia32_fpclasspd256_mask((__v4df)(__m256d)(A), (int)(imm), \
                                             (__mmask8)(U)); })

#define _mm256_fpclass_pd_mask(A, imm) __extension__ ({ \
  (__mmask8)__builtin_ia32_fpclasspd256_mask((__v4df)(__m256d)(A), (int)(imm), \
                                             (__mmask8)-1); })

#define _mm_mask_fpclass_ps_mask(U, A, imm) __extension__ ({ \
  (__mmask8)__builtin_ia32_fpclassps128_mask((__v4sf)(__m128)(A), (int)(imm), \
                                             (__mmask8)(U)); })

#define _mm_fpclass_ps_mask(A, imm) __extension__ ({ \
  (__mmask8)__builtin_ia32_fpclassps128_mask((__v4sf)(__m128)(A), (int)(imm), \
                                             (__mmask8)-1); })

#define _mm256_mask_fpclass_ps_mask(U, A, imm) __extension__ ({ \
  (__mmask8)__builtin_ia32_fpclassps256_mask((__v8sf)(__m256)(A), (int)(imm), \
                                             (__mmask8)(U)); })

#define _mm256_fpclass_ps_mask(A, imm) __extension__ ({ \
  (__mmask8)__builtin_ia32_fpclassps256_mask((__v8sf)(__m256)(A), (int)(imm), \
                                             (__mmask8)-1); })

#undef __DEFAULT_FN_ATTRS

#endif /* __AVX512VLDQINTRIN_H */

static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cvtepi64_ps(__m128i __A)
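The fpclass macros listed above test each lane against the category bits of the immediate; in the VFPCLASSPD encoding, QNaN is bit 0 and SNaN is bit 7, so 0x81 matches any NaN. A minimal sketch with hypothetical values:

  __m128d v = _mm_set_pd(__builtin_nan(""), 1.0);   /* lane1 = NaN */
  __mmask8 k = _mm_fpclass_pd_mask(v, 0x81);        /* k == 0x2 */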
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_mask_xor_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
static __inline__ __m256i __DEFAULT_FN_ATTRS _mm256_mask_cvttpd_epi64(__m256i __W, __mmask8 __U, __m256d __A)
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_setzero_ps(void)
Constructs a 256-bit floating-point vector of [8 x float] with all vector elements initialized to zero.
static __inline__ __mmask8 __DEFAULT_FN_ATTRS _mm_movepi64_mask(__m128i __A)
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_maskz_cvttps_epu64(__mmask8 __U, __m128 __A)
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvttps_epi64(__m128 __A)
static __inline__ __m256d __DEFAULT_FN_ATTRS _mm256_broadcast_f64x2(__m128d __A)
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_mask_cvtepu64_ps(__m128 __W, __mmask8 __U, __m128i __A)
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm256_mask_cvtepi64_ps(__m128 __W, __mmask8 __U, __m256i __A)
static __inline__ __m256i __DEFAULT_FN_ATTRS _mm256_cvttps_epu64(__m128 __A)
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mask_cvtpd_epu64(__m128i __W, __mmask8 __U, __m128d __A)
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtepi64_pd(__m128i __A)
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_maskz_xor_pd(__mmask8 __U, __m128d __A, __m128d __B)
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_or_ps(__m256 __a, __m256 __b)
Performs a bitwise OR of two 256-bit vectors of [8 x float].
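These bitwise operations compose into the classic mask-blend idiom; a hedged sketch (m, a and b are hypothetical __m256 values, with m holding all-ones lanes where a should win):

  __m256 blended = _mm256_or_ps(_mm256_and_ps(m, a), _mm256_andnot_ps(m, b));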
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mask_mullo_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_broadcast_i32x2(__m128i __A)
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_maskz_cvtpd_epi64(__mmask8 __U, __m128d __A)
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_mask_xor_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_maskz_or_ps(__mmask8 __U, __m128 __A, __m128 __B)
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_and_ps(__m256 __a, __m256 __b)
Performs a bitwise AND of two 256-bit vectors of [8 x float].
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_maskz_xor_ps(__mmask8 __U, __m128 __A, __m128 __B)
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvttpd_epi64(__m128d __A)
static __inline__ __m256 __DEFAULT_FN_ATTRS _mm256_mask_or_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B)
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvttpd_epu64(__m128d __A)
static __inline__ __m256 __DEFAULT_FN_ATTRS _mm256_mask_xor_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B)
static __inline__ __m256i __DEFAULT_FN_ATTRS _mm256_cvtpd_epu64(__m256d __A)
static __inline__ __m256d __DEFAULT_FN_ATTRS _mm256_mask_or_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B)
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_maskz_andnot_ps(__mmask8 __U, __m128 __A, __m128 __B)
static __inline__ __m256i __DEFAULT_FN_ATTRS _mm256_mask_broadcast_i64x2(__m256i __O, __mmask8 __M, __m128i __A)
static __inline__ __m256i __DEFAULT_FN_ATTRS _mm256_mask_cvttps_epi64(__m256i __W, __mmask8 __U, __m128 __A)
static __inline__ __m256i __DEFAULT_FN_ATTRS _mm256_maskz_cvttpd_epu64(__mmask8 __U, __m256d __A)
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtpd_epu64(__m128d __A)
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm256_cvtepi64_ps(__m256i __A)
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mask_cvtps_epu64(__m128i __W, __mmask8 __U, __m128 __A)
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_mask_or_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
static __inline__ __mmask8 __DEFAULT_FN_ATTRS _mm_movepi32_mask(__m128i __A)
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_maskz_broadcast_i32x2(__mmask8 __M, __m128i __A)
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_undefined_ps(void)
Create a 128-bit vector of [4 x float] with undefined values.
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_and_pd(__m256d __a, __m256d __b)
Performs a bitwise AND of two 256-bit vectors of [4 x double].
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_maskz_cvtepi64_pd(__mmask8 __U, __m128i __A)
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mask_cvttpd_epi64(__m128i __W, __mmask8 __U, __m128d __A)
static __inline__ __m256i __DEFAULT_FN_ATTRS _mm256_movm_epi32(__mmask8 __A)
static __inline__ __m256i __DEFAULT_FN_ATTRS _mm256_maskz_cvttpd_epi64(__mmask8 __U, __m256d __A)
static __inline__ __m256i __DEFAULT_FN_ATTRS _mm256_cvtpd_epi64(__m256d __A)
static __inline__ __m256i __DEFAULT_FN_ATTRS _mm256_mask_broadcast_i32x2(__m256i __O, __mmask8 __M, __m128i __A)
static __inline__ __m256i __DEFAULT_FN_ATTRS _mm256_mask_cvtps_epi64(__m256i __W, __mmask8 __U, __m128 __A)
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_mask_cvtepi64_ps(__m128 __W, __mmask8 __U, __m128i __A)
static __inline__ __m256d __DEFAULT_FN_ATTRS _mm256_mask_cvtepi64_pd(__m256d __W, __mmask8 __U, __m256i __A)
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm256_maskz_cvtepi64_ps(__mmask8 __U, __m256i __A)
static __inline__ __m256d __DEFAULT_FN_ATTRS _mm256_maskz_or_pd(__mmask8 __U, __m256d __A, __m256d __B)
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_mask_andnot_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
static __inline__ __m256d __DEFAULT_FN_ATTRS _mm256_mask_broadcast_f64x2(__m256d __O, __mmask8 __M, __m128d __A)
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm256_maskz_cvtepu64_ps(__mmask8 __U, __m256i __A)
static __inline__ __m256i __DEFAULT_FN_ATTRS _mm256_maskz_cvtps_epi64(__mmask8 __U, __m128 __A)
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_movm_epi32(__mmask8 __A)
static __inline__ __m256i __DEFAULT_FN_ATTRS _mm256_mullo_epi64(__m256i __A, __m256i __B)
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_maskz_and_ps(__mmask8 __U, __m128 __A, __m128 __B)
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm256_cvtepu64_ps(__m256i __A)
static __inline__ __m256i __DEFAULT_FN_ATTRS _mm256_maskz_cvttps_epu64(__mmask8 __U, __m128 __A)
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_mask_or_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
static __inline__ __m256i __DEFAULT_FN_ATTRS _mm256_mask_cvttpd_epu64(__m256i __W, __mmask8 __U, __m256d __A)
static __inline__ __m256d __DEFAULT_FN_ATTRS _mm256_maskz_cvtepu64_pd(__mmask8 __U, __m256i __A)
static __inline__ __m256d __DEFAULT_FN_ATTRS _mm256_mask_xor_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B)
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtps_epu64(__m128 __A)
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_setzero_ps(void)
Constructs a 128-bit floating-point vector of [4 x float] initialized to zero.
static __inline__ __m256 __DEFAULT_FN_ATTRS _mm256_mask_andnot_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B)
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_maskz_cvtepu64_pd(__mmask8 __U, __m128i __A)
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_maskz_mullo_epi64(__mmask8 __U, __m128i __A, __m128i __B)
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_setzero_pd(void)
Constructs a 128-bit floating-point vector of [2 x double] initialized to zero.
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mask_cvtps_epi64(__m128i __W, __mmask8 __U, __m128 __A)
static __inline__ __m256i __DEFAULT_FN_ATTRS _mm256_maskz_broadcast_i64x2(__mmask8 __M, __m128i __A)
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm256_mask_cvtepu64_ps(__m128 __W, __mmask8 __U, __m256i __A)
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_xor_ps(__m128 __a, __m128 __b)
Performs a bitwise exclusive OR of two 128-bit vectors of [4 x float].
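One common use (an illustrative sketch, not from the header's docs): XOR with -0.0f flips only the sign bit of each lane.

  __m128 v   = _mm_setr_ps(1.0f, -2.0f, 3.0f, -4.0f);
  __m128 neg = _mm_xor_ps(v, _mm_set1_ps(-0.0f));   /* {-1, 2, -3, 4} */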
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_maskz_cvtpd_epu64(__mmask8 __U, __m128d __A)
static __inline__ __m256d __DEFAULT_FN_ATTRS _mm256_cvtepu64_pd(__m256i __A)
static __inline__ __m256i __DEFAULT_FN_ATTRS _mm256_maskz_cvtpd_epi64(__mmask8 __U, __m256d __A)
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_maskz_cvttps_epi64(__mmask8 __U, __m128 __A)
static __inline__ __m256d __DEFAULT_FN_ATTRS _mm256_cvtepi64_pd(__m256i __A)
static __inline__ __m256d __DEFAULT_FN_ATTRS _mm256_maskz_xor_pd(__mmask8 __U, __m256d __A, __m256d __B)
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_mask_cvtepi64_pd(__m128d __W, __mmask8 __U, __m128i __A)
static __inline__ __m256d __DEFAULT_FN_ATTRS _mm256_mask_andnot_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B)
static __inline__ __m256i __DEFAULT_FN_ATTRS _mm256_mask_cvttps_epu64(__m256i __W, __mmask8 __U, __m128 __A)
static __inline__ __m256d __DEFAULT_FN_ATTRS _mm256_maskz_broadcast_f64x2(__mmask8 __M, __m128d __A)
static __inline__ __m256i __DEFAULT_FN_ATTRS _mm256_maskz_cvtpd_epu64(__mmask8 __U, __m256d __A)
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_or_pd(__m128d __a, __m128d __b)
Performs a bitwise OR of two 128-bit vectors of [2 x double].
static __inline__ __m256 __DEFAULT_FN_ATTRS _mm256_mask_and_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B)
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mask_cvttpd_epu64(__m128i __W, __mmask8 __U, __m128d __A)
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtps_epi64(__m128 __A)
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_maskz_cvtepi64_ps(__mmask8 __U, __m128i __A)
static __inline__ __m256 __DEFAULT_FN_ATTRS _mm256_broadcast_f32x2(__m128 __A)
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_and_pd(__m128d __a, __m128d __b)
Performs a bitwise AND of two 128-bit vectors of [2 x double].
static __inline__ __m256i __DEFAULT_FN_ATTRS _mm256_maskz_broadcast_i32x2(__mmask8 __M, __m128i __A)
static __inline__ __m256i __DEFAULT_FN_ATTRS _mm256_broadcast_i64x2(__m128i __A)
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_andnot_pd(__m256d __a, __m256d __b)
Performs a bitwise AND of two 256-bit vectors of [4 x double], using the one's complement of the values contained in the first source operand.
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_maskz_cvtps_epu64(__mmask8 __U, __m128 __A)
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_maskz_cvtepu64_ps(__mmask8 __U, __m128i __A)
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_andnot_ps(__m128 __a, __m128 __b)
Performs a bitwise AND of two 128-bit vectors of [4 x float], using the one's complement of the values contained in the first source operand.
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_maskz_cvttpd_epi64(__mmask8 __U, __m128d __A)
static __inline__ __m256 __DEFAULT_FN_ATTRS _mm256_maskz_and_ps(__mmask8 __U, __m256 __A, __m256 __B)
static __inline__ __m256i __DEFAULT_FN_ATTRS _mm256_movm_epi64(__mmask8 __A)
static __inline__ __m256i __DEFAULT_FN_ATTRS _mm256_maskz_cvtps_epu64(__mmask8 __U, __m128 __A)
static __inline__ __m256 __DEFAULT_FN_ATTRS _mm256_mask_broadcast_f32x2(__m256 __O, __mmask8 __M, __m128 __A)
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_xor_ps(__m256 __a, __m256 __b)
Performs a bitwise XOR of two 256-bit vectors of [8 x float].
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_maskz_or_pd(__mmask8 __U, __m128d __A, __m128d __B)
static __inline__ __m256i __DEFAULT_FN_ATTRS _mm256_cvttps_epi64(__m128 __A)
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_xor_pd(__m128d __a, __m128d __b)
Performs a bitwise XOR of two 128-bit vectors of [2 x double].
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_andnot_ps(__m256 __a, __m256 __b)
Performs a bitwise AND of two 256-bit vectors of [8 x float], using the one's complement of the values contained in the first source operand.
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_maskz_andnot_pd(__mmask8 __U, __m128d __A, __m128d __B)
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_setzero_si128(void)
Creates a 128-bit integer vector initialized to zero.
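The maskz_* intrinsics in this header use this zero vector as their blend source, so an all-zero mask reproduces it; a minimal sketch:

  __m128i a = _mm_set1_epi64x(7), b = _mm_set1_epi64x(6);
  __m128i z = _mm_maskz_mullo_epi64(0, a, b);   /* equals _mm_setzero_si128() */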
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_andnot_pd(__m128d __a, __m128d __b)
Performs a bitwise AND of two 128-bit vectors of [2 x double], using the one's complement of the values contained in the first source operand.
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mullo_epi64(__m128i __A, __m128i __B)
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_mask_and_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtepu64_pd(__m128i __A)
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cvtepu64_ps(__m128i __A)
static __inline__ __m256d __DEFAULT_FN_ATTRS _mm256_maskz_and_pd(__mmask8 __U, __m256d __A, __m256d __B)
static __inline__ __m256i __DEFAULT_FN_ATTRS _mm256_cvttpd_epi64(__m256d __A)
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvttps_epu64(__m128 __A)
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_mask_and_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_maskz_and_pd(__mmask8 __U, __m128d __A, __m128d __B)
static __inline__ __m256i __DEFAULT_FN_ATTRS _mm256_mask_mullo_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B)
static __inline__ __m256d __DEFAULT_FN_ATTRS _mm256_mask_cvtepu64_pd(__m256d __W, __mmask8 __U, __m256i __A)
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_mask_andnot_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_undefined_si128(void)
Generates a 128-bit vector of [4 x i32] with unspecified content.
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mask_cvttps_epu64(__m128i __W, __mmask8 __U, __m128 __A)
static __inline__ __mmask8 __DEFAULT_FN_ATTRS _mm256_movepi32_mask(__m256i __A)
static __inline__ __m256i __DEFAULT_FN_ATTRS _mm256_cvtps_epi64(__m128 __A)
static __inline__ __m256 __DEFAULT_FN_ATTRS _mm256_maskz_broadcast_f32x2(__mmask8 __M, __m128 __A)
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_or_ps(__m128 __a, __m128 __b)
Performs a bitwise OR of two 128-bit vectors of [4 x float].
static __inline__ __m256i __DEFAULT_FN_ATTRS _mm256_mask_cvtpd_epu64(__m256i __W, __mmask8 __U, __m256d __A)
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_xor_pd(__m256d __a, __m256d __b)
Performs a bitwise XOR of two 256-bit vectors of [4 x double].
static __inline__ __m256i __DEFAULT_FN_ATTRS _mm256_mask_cvtpd_epi64(__m256i __W, __mmask8 __U, __m256d __A)
static __inline__ __m256 __DEFAULT_FN_ATTRS _mm256_maskz_andnot_ps(__mmask8 __U, __m256 __A, __m256 __B)
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_mask_cvtepu64_pd(__m128d __W, __mmask8 __U, __m128i __A)
static __inline__ __m256d __DEFAULT_FN_ATTRS _mm256_mask_and_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B)
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mask_cvttps_epi64(__m128i __W, __mmask8 __U, __m128 __A)
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtpd_epi64(__m128d __A)
static __inline__ __m256i __DEFAULT_FN_ATTRS _mm256_mask_cvtps_epu64(__m256i __W, __mmask8 __U, __m128 __A)
static __inline__ __m256d __DEFAULT_FN_ATTRS _mm256_maskz_cvtepi64_pd(__mmask8 __U, __m256i __A)
static __inline __m256i __DEFAULT_FN_ATTRS _mm256_setzero_si256(void)
Constructs a 256-bit integer vector initialized to zero.
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_and_ps(__m128 __a, __m128 __b)
Performs a bitwise AND of two 128-bit vectors of [4 x float].
static __inline__ __m256 __DEFAULT_FN_ATTRS _mm256_maskz_xor_ps(__mmask8 __U, __m256 __A, __m256 __B)
static __inline__ __mmask8 __DEFAULT_FN_ATTRS _mm256_movepi64_mask(__m256i __A)
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_movm_epi64(__mmask8 __A)
static __inline__ __m256 __DEFAULT_FN_ATTRS _mm256_maskz_or_ps(__mmask8 __U, __m256 __A, __m256 __B)
static __inline__ __m256i __DEFAULT_FN_ATTRS _mm256_broadcast_i32x2(__m128i __A)
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mask_broadcast_i32x2(__m128i __O, __mmask8 __M, __m128i __A)
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_setzero_pd(void)
Constructs a 256-bit floating-point vector of [4 x double] with all vector elements initialized to zero.
static __inline__ __m256i __DEFAULT_FN_ATTRS _mm256_cvtps_epu64(__m128 __A)
static __inline__ __m256i __DEFAULT_FN_ATTRS _mm256_cvttpd_epu64(__m256d __A)
static __inline__ __m256i __DEFAULT_FN_ATTRS _mm256_maskz_cvttps_epi64(__mmask8 __U, __m128 __A)
static __inline__ __m256i __DEFAULT_FN_ATTRS _mm256_maskz_mullo_epi64(__mmask8 __U, __m256i __A, __m256i __B)
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_maskz_cvttpd_epu64(__mmask8 __U, __m128d __A)
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_or_pd(__m256d __a, __m256d __b)
Performs a bitwise OR of two 256-bit vectors of [4 x double].
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mask_cvtpd_epi64(__m128i __W, __mmask8 __U, __m128d __A)
static __inline__ __m256d __DEFAULT_FN_ATTRS _mm256_maskz_andnot_pd(__mmask8 __U, __m256d __A, __m256d __B)
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_maskz_cvtps_epi64(__mmask8 __U, __m128 __A)