#include <intrin.h>

// Round to nearest integer using the current SSE rounding mode (cvtsd2si).
inline int round(double x)
{
    return(_mm_cvtsd_si32(_mm_set_sd(x)));
}

// Truncate toward zero (cvttsd2si).
inline int trunc(double x)
{
    return(_mm_cvttsd_si32(_mm_set_sd(x)));
}

// 64-bit variants; the si64x conversions are only available in x64 builds.
inline __int64 round64(double x)
{
    return(_mm_cvtsd_si64x(_mm_set_sd(x)));
}

inline __int64 trunc64(double x)
{
    return(_mm_cvttsd_si64x(_mm_set_sd(x)));
}
I benchmarked these carefully on an Intel Xeon E5520. The intrinsic SSE round is almost twice as fast as the conditional-offset-and-truncate method:
inline int round(double x)
{
    return(int(x > 0 ? x + 0.5 : x - 0.5));
}
This is understandable, since the intrinsic SSE round compiles to a single instruction:
cvtsd2si eax,xmm0
whereas the conditional-offset-and-truncate method compiles to:
xorpd xmm7,xmm7 ; xmm7 = 0
movsd xmm2,0.5 ; xmm2 = 0.5
comisd xmm1,xmm7 ; x > 0?
jbe $1 ; n, skip to neg case
movsd xmm0,xmm1
addsd xmm0,xmm2 ; x += 0.5
jmp $2
$1:
movsd xmm0,xmm1
subsd xmm0,xmm2 ; x -= 0.5
$2:
cvttsd2si eax,xmm0 ; eax = trunc(x)