我创建了一个使用SIMD进行64位*64位到128位的功能.目前我已经使用SSE2(acutally SSE4.1)实现了它.这意味着它可以同时运行两个64b*64b到128b的产品.同样的想法可以扩展到AVX2或AVX512,同时提供四个或八个64b*64到128b的产品.我的算法基于http://www.hackersdelight.org/hdcodetxt/muldws.c.txt
该算法进行一次无符号乘法,一次有符号乘法和两次有符号*无符号乘法.签名的*signed和unsigned*unsigned操作很容易使用_mm_mul_epi32
和_mm_mul_epu32
.但混合签名和未签名的产品给我带来了麻烦.例如,考虑一下.
int32_t x = 0x80000000; uint32_t y = 0x7fffffff; int64_t z = (int64_t)x*y;
双字产品应该是0xc000000080000000
.但是如果你假设你的编译器知道如何处理混合类型,你怎么能得到这个呢?这就是我想出的:
int64_t sign = x<0; sign*=-1; //get the sign and make it all ones uint32_t t = abs(x); //if x<0 take two's complement again uint64_t prod = (uint64_t)t*y; //unsigned product int64_t z = (prod ^ sign) - sign; //take two's complement based on the sign
使用SSE可以这样做
__m128i xh; //(xl2, xh2, xl1, xh1) high is signed, low unsigned __m128i yl; //(yh2, yl2, yh2, yl2) __m128i xs = _mm_cmpgt_epi32(_mm_setzero_si128(), xh); // get sign xs = _mm_shuffle_epi32(xs, 0xA0); // extend sign __m128i t = _mm_sign_epi32(xh,xh); // abs(xh) __m128i prod = _mm_mul_epu32(t, yl); // unsigned (xh2*yl2,xh1*yl1) __m128i inv = _mm_xor_si128(prod,xs); // invert bits if negative __m128i z = _mm_sub_epi64(inv,xs); // add 1 if negative
这给出了正确的结果.但是我必须做两次(平时一次)它现在是我功能的重要部分.使用SSE4.2,AVX2(四个128位产品),甚至AVX512(八个128位产品)是否有更有效的方法?
也许有比SIMD更有效的方法吗?获得高位字是很多计算方法.
编辑:根据@ElderBug的评论,看起来这样做的方法不是使用SIMD而是使用mul
指令.对于它的价值,如果有人想要看到这是多么复杂,这里是完整的工作功能(我只是让它工作,所以我没有优化它,但我不认为这是值得的).
void muldws1_sse(__m128i x, __m128i y, __m128i *lo, __m128i *hi) { __m128i lomask = _mm_set1_epi64x(0xffffffff); __m128i xh = _mm_shuffle_epi32(x, 0xB1); // x0l, x0h, x1l, x1h __m128i yh = _mm_shuffle_epi32(y, 0xB1); // y0l, y0h, y1l, y1h __m128i xs = _mm_cmpgt_epi32(_mm_setzero_si128(), xh); __m128i ys = _mm_cmpgt_epi32(_mm_setzero_si128(), yh); xs = _mm_shuffle_epi32(xs, 0xA0); ys = _mm_shuffle_epi32(ys, 0xA0); __m128i w0 = _mm_mul_epu32(x, y); // x0l*y0l, y0l*y0h __m128i w3 = _mm_mul_epi32(xh, yh); // x0h*y0h, x1h*y1h xh = _mm_sign_epi32(xh,xh); yh = _mm_sign_epi32(yh,yh); __m128i w1 = _mm_mul_epu32(x, yh); // x0l*y0h, x1l*y1h __m128i w2 = _mm_mul_epu32(xh, y); // x0h*y0l, x1h*y0l __m128i yinv = _mm_xor_si128(w1,ys); // invert bits if negative w1 = _mm_sub_epi64(yinv,ys); // add 1 __m128i xinv = _mm_xor_si128(w2,xs); // invert bits if negative w2 = _mm_sub_epi64(xinv,xs); // add 1 __m128i w0l = _mm_and_si128(w0, lomask); __m128i w0h = _mm_srli_epi64(w0, 32); __m128i s1 = _mm_add_epi64(w1, w0h); // xl*yh + w0h; __m128i s1l = _mm_and_si128(s1, lomask); // lo(wl*yh + w0h); __m128i s1h = _mm_srai_epi64(s1, 32); __m128i s2 = _mm_add_epi64(w2, s1l); //xh*yl + s1l __m128i s2l = _mm_slli_epi64(s2, 32); __m128i s2h = _mm_srai_epi64(s2, 32); //arithmetic shift right __m128i hi1 = _mm_add_epi64(w3, s1h); hi1 = _mm_add_epi64(hi1, s2h); __m128i lo1 = _mm_add_epi64(w0l, s2l); *hi = hi1; *lo = lo1; }
它变得更糟.在_mm_srai_epi64
AVX512之前没有任何内在/指令,所以我必须自己制作.
static inline __m128i _mm_srai_epi64(__m128i a, int b) { __m128i sra = _mm_srai_epi32(a,32); __m128i srl = _mm_srli_epi64(a,32); __m128i mask = _mm_set_epi32(-1,0,-1,0); __m128i out = _mm_blendv_epi8(srl, sra, mask); }
我_mm_srai_epi64
上面的实现是不完整的.我想我正在使用Agner Fog的Vector Class Library.如果您查看文件vectori128.h,您会发现
static inline Vec2q operator >> (Vec2q const & a, int32_t b) { // instruction does not exist. Split into 32-bit shifts if (b <= 32) { __m128i bb = _mm_cvtsi32_si128(b); // b __m128i sra = _mm_sra_epi32(a,bb); // a >> b signed dwords __m128i srl = _mm_srl_epi64(a,bb); // a >> b unsigned qwords __m128i mask = _mm_setr_epi32(0,-1,0,-1); // mask for signed high part return selectb(mask,sra,srl); } else { // b > 32 __m128i bm32 = _mm_cvtsi32_si128(b-32); // b - 32 __m128i sign = _mm_srai_epi32(a,31); // sign of a __m128i sra2 = _mm_sra_epi32(a,bm32); // a >> (b-32) signed dwords __m128i sra3 = _mm_srli_epi64(sra2,32); // a >> (b-32) >> 32 (second shift unsigned qword) __m128i mask = _mm_setr_epi32(0,-1,0,-1); // mask for high part containing only sign return selectb(mask,sign,sra3); } }
Z boson.. 9
我发现SIMD解决方案更简单,不需要signed*unsigned
产品. 我不再相信SIMD(至少与AVX2和AV512一样)无法与之竞争在某些情况下,SIMD可以与之竞争mulx
.mulx
.我所知道的唯一情况是基于FFT的大数乘法.
诀窍是先做无符号乘法,然后再纠正.我从这个答案32位有符号乘法 - 不使用64位数据类型中学到了如何做到这一点.对于(hi,lo) = x*y
无符号乘法进行校正很简单,然后hi
像这样校正:
hi -= ((x<0) ? y : 0) + ((y<0) ? x : 0)
这可以通过SSE4.2内在来完成 _mm_cmpgt_epi64
void muldws1_sse(__m128i x, __m128i y, __m128i *lo, __m128i *hi) { muldwu1_sse(x,y,lo,hi); //hi -= ((x<0) ? y : 0) + ((y<0) ? x : 0); __m128i xs = _mm_cmpgt_epi64(_mm_setzero_si128(), x); __m128i ys = _mm_cmpgt_epi64(_mm_setzero_si128(), y); __m128i t1 = _mm_and_si128(y,xs); __m128i t2 = _mm_and_si128(x,ys); *hi = _mm_sub_epi64(*hi,t1); *hi = _mm_sub_epi64(*hi,t2); }
无符号乘法的代码更简单,因为它不需要混合signed*unsigned
产品.另外,因为它是无符号的,所以它不需要算术右移,它只有AVX512的指令.实际上以下功能只需要SSE2:
void muldwu1_sse(__m128i x, __m128i y, __m128i *lo, __m128i *hi) { __m128i lomask = _mm_set1_epi64x(0xffffffff); __m128i xh = _mm_shuffle_epi32(x, 0xB1); // x0l, x0h, x1l, x1h __m128i yh = _mm_shuffle_epi32(y, 0xB1); // y0l, y0h, y1l, y1h __m128i w0 = _mm_mul_epu32(x, y); // x0l*y0l, x1l*y1l __m128i w1 = _mm_mul_epu32(x, yh); // x0l*y0h, x1l*y1h __m128i w2 = _mm_mul_epu32(xh, y); // x0h*y0l, x1h*y0l __m128i w3 = _mm_mul_epu32(xh, yh); // x0h*y0h, x1h*y1h __m128i w0l = _mm_and_si128(w0, lomask); //(*) __m128i w0h = _mm_srli_epi64(w0, 32); __m128i s1 = _mm_add_epi64(w1, w0h); __m128i s1l = _mm_and_si128(s1, lomask); __m128i s1h = _mm_srli_epi64(s1, 32); __m128i s2 = _mm_add_epi64(w2, s1l); __m128i s2l = _mm_slli_epi64(s2, 32); //(*) __m128i s2h = _mm_srli_epi64(s2, 32); __m128i hi1 = _mm_add_epi64(w3, s1h); hi1 = _mm_add_epi64(hi1, s2h); __m128i lo1 = _mm_add_epi64(w0l, s2l); //(*) //__m128i lo1 = _mm_mullo_epi64(x,y); //alternative *hi = hi1; *lo = lo1; }
这用
4x mul_epu32 5x add_epi64 2x shuffle_epi32 2x and 2x srli_epi64 1x slli_epi64 **************** 16 instructions
AVX512具有_mm_mullo_epi64
可以lo
用一条指令计算的内在函数.在这种情况下,可以使用替代方法(使用(*)注释注释行并取消注释替代行):
5x mul_epu32 4x add_epi64 2x shuffle_epi32 1x and 2x srli_epi64 **************** 14 instructions
若要更改整个宽度AVX2的代码替换_mm
用_mm256
,si128
用si256
,并__m128i
具有__m256i
对AVX512与替换它们_mm512
,si512
和__m512i
.
我发现SIMD解决方案更简单,不需要signed*unsigned
产品. 我不再相信SIMD(至少与AVX2和AV512一样)无法与之竞争在某些情况下,SIMD可以与之竞争mulx
.mulx
.我所知道的唯一情况是基于FFT的大数乘法.
诀窍是先做无符号乘法,然后再纠正.我从这个答案32位有符号乘法 - 不使用64位数据类型中学到了如何做到这一点.对于(hi,lo) = x*y
无符号乘法进行校正很简单,然后hi
像这样校正:
hi -= ((x<0) ? y : 0) + ((y<0) ? x : 0)
这可以通过SSE4.2内在来完成 _mm_cmpgt_epi64
void muldws1_sse(__m128i x, __m128i y, __m128i *lo, __m128i *hi) { muldwu1_sse(x,y,lo,hi); //hi -= ((x<0) ? y : 0) + ((y<0) ? x : 0); __m128i xs = _mm_cmpgt_epi64(_mm_setzero_si128(), x); __m128i ys = _mm_cmpgt_epi64(_mm_setzero_si128(), y); __m128i t1 = _mm_and_si128(y,xs); __m128i t2 = _mm_and_si128(x,ys); *hi = _mm_sub_epi64(*hi,t1); *hi = _mm_sub_epi64(*hi,t2); }
无符号乘法的代码更简单,因为它不需要混合signed*unsigned
产品.另外,因为它是无符号的,所以它不需要算术右移,它只有AVX512的指令.实际上以下功能只需要SSE2:
void muldwu1_sse(__m128i x, __m128i y, __m128i *lo, __m128i *hi) { __m128i lomask = _mm_set1_epi64x(0xffffffff); __m128i xh = _mm_shuffle_epi32(x, 0xB1); // x0l, x0h, x1l, x1h __m128i yh = _mm_shuffle_epi32(y, 0xB1); // y0l, y0h, y1l, y1h __m128i w0 = _mm_mul_epu32(x, y); // x0l*y0l, x1l*y1l __m128i w1 = _mm_mul_epu32(x, yh); // x0l*y0h, x1l*y1h __m128i w2 = _mm_mul_epu32(xh, y); // x0h*y0l, x1h*y0l __m128i w3 = _mm_mul_epu32(xh, yh); // x0h*y0h, x1h*y1h __m128i w0l = _mm_and_si128(w0, lomask); //(*) __m128i w0h = _mm_srli_epi64(w0, 32); __m128i s1 = _mm_add_epi64(w1, w0h); __m128i s1l = _mm_and_si128(s1, lomask); __m128i s1h = _mm_srli_epi64(s1, 32); __m128i s2 = _mm_add_epi64(w2, s1l); __m128i s2l = _mm_slli_epi64(s2, 32); //(*) __m128i s2h = _mm_srli_epi64(s2, 32); __m128i hi1 = _mm_add_epi64(w3, s1h); hi1 = _mm_add_epi64(hi1, s2h); __m128i lo1 = _mm_add_epi64(w0l, s2l); //(*) //__m128i lo1 = _mm_mullo_epi64(x,y); //alternative *hi = hi1; *lo = lo1; }
这用
4x mul_epu32 5x add_epi64 2x shuffle_epi32 2x and 2x srli_epi64 1x slli_epi64 **************** 16 instructions
AVX512具有_mm_mullo_epi64
可以lo
用一条指令计算的内在函数.在这种情况下,可以使用替代方法(使用(*)注释注释行并取消注释替代行):
5x mul_epu32 4x add_epi64 2x shuffle_epi32 1x and 2x srli_epi64 **************** 14 instructions
若要更改整个宽度AVX2的代码替换_mm
用_mm256
,si128
用si256
,并__m128i
具有__m256i
对AVX512与替换它们_mm512
,si512
和__m512i
.
使用各种指令考虑整数乘法的吞吐量限制的正确方法是根据每个周期可以计算多少"产品位".
mulx
每周期生成一个64x64 - > 128个结果; 这是每周期64x64 = 4096"产品位"
如果将SIMD上的乘法器与32x32 - > 64位乘法的指令拼接在一起,则需要能够在每个周期得到4个结果才能匹配mulx
(4x32x32 = 4096).如果除了乘法之外没有算术,你只需要在AVX2上收支平衡.不幸的是,正如你已经注意到的那样,除了乘法之外还有很多算术,所以这是当前一代硬件的非启动性.