A little example to evaluate:
double FreqTime(const LARGE_INTEGER& t,const LARGE_INTEGER& f)
{
typedef struct { static double tod(const LARGE_INTEGER& li){ return (4294967296.0*li.HighPart+li.LowPart); } }_;
return 1e3*_::tod(t)/_::tod(f);
}
// Reference implementation: XORs two buffers one byte at a time.
//   inBuf1, inBuf2 - input buffers, each at least `size` bytes long
//   outBuf         - receives inBuf1[k] ^ inBuf2[k] for every k in [0, size)
//   size           - number of bytes to process; 0 leaves outBuf untouched
void XOR(char* inBuf1, char* inBuf2, char*outBuf, unsigned int size)
{
    for(unsigned int pos=0; pos<size; ++pos)
    {
        outBuf[pos] = inBuf1[pos] ^ inBuf2[pos];
    }
}
// XORs two buffers using words of type TI, then finishes any leftover
// bytes with the byte-wide instantiation.
//   TI     - integer type used as the processing word (must support operator^)
//   i1, i2 - input buffers, each at least `cb` bytes long
//   o      - output buffer, at least `cb` bytes long
//   cb     - number of bytes to process
// NOTE(review): the buffers are accessed through TI*, so callers presumably
// need to pass pointers suitably aligned for TI - confirm against callers.
template <class TI>
void tXOR(char* i1, char* i2, char* o, unsigned int cb)
{
    // Number of whole TI words that fit in cb bytes, and the bytes they cover.
    const unsigned int wordCount = cb/sizeof(TI);
    const unsigned int wordBytes = sizeof(TI)*wordCount;

    TI* lhs = reinterpret_cast<TI*>(i1);
    TI* rhs = reinterpret_cast<TI*>(i2);
    TI* dst = reinterpret_cast<TI*>(o);
    for(TI* const dstEnd = dst+wordCount; dst!=dstEnd; ++dst, ++lhs, ++rhs)
    {
        *dst = *lhs ^ *rhs;
    }

    // Trailing bytes that do not fill a whole word. For TI = char this never
    // triggers (wordBytes == cb), so the recursion always terminates.
    if(wordBytes<cb)
    {
        tXOR<char>(i1+wordBytes, i2+wordBytes, o+wordBytes, cb-wordBytes);
    }
}
// SSE specialization: XORs the buffers 16 bytes at a time with _mm_xor_ps,
// then finishes any leftover bytes with the byte-wide instantiation.
//   i1, i2 - input buffers, each at least `cb` bytes long
//   o      - output buffer, at least `cb` bytes long
//   cb     - number of bytes to process
// The buffers are read and written with the unaligned load/store intrinsics,
// so they do not have to be 16-byte aligned (a plain __m128* dereference
// requires that alignment, which malloc does not guarantee on every target).
template <>
void tXOR<__m128>(char* i1, char* i2, char* o, unsigned int cb)
{
    // Number of whole 16-byte vectors in cb bytes, and the bytes they cover.
    const unsigned int nn = cb/sizeof(__m128);
    const unsigned int dd = static_cast<unsigned int>(sizeof(__m128)*nn);

    // dd is a multiple of sizeof(__m128), so `off` lands exactly on dd.
    for(unsigned int off=0; off<dd; off+=sizeof(__m128))
    {
        const __m128 a = _mm_loadu_ps(reinterpret_cast<const float*>(i1+off));
        const __m128 b = _mm_loadu_ps(reinterpret_cast<const float*>(i2+off));
        _mm_storeu_ps(reinterpret_cast<float*>(o+off), _mm_xor_ps(a,b));
    }

    // Trailing bytes that do not fill a whole vector.
    if(dd<cb) tXOR<char>(i1+dd,i2+dd,o+dd,cb-dd);
}
int _tmain(int argc, _TCHAR* argv[])
{
LARGE_INTEGER freq = {0,0};
LARGE_INTEGER t0 = {0,0};
LARGE_INTEGER t1 = {0,0};
QueryPerformanceFrequency(&freq);
enum{ BUFFSIZE=1<<20, LOOPS=1000, };
unsigned int i;
char* in1 = (char*)malloc(BUFFSIZE);
char* in2 = (char*)malloc(BUFFSIZE);
char* out = (char*)malloc(BUFFSIZE);
QueryPerformanceCounter(&t0);
srand(t0.LowPart);
for(i=0;i<BUFFSIZE;i++)
{
in1[i] = MulDiv(rand(),1,RAND_MAX);
in2[i] = MulDiv(rand(),1,RAND_MAX);
}
QueryPerformanceCounter(&t0);
for(i=0;i<LOOPS;i++) XOR(in1,in2,out,BUFFSIZE);
QueryPerformanceCounter(&t1);
_tprintf(__T("%s(%i,%i) = %lfms\r\n"),__TEXT("XOR "),LOOPS,BUFFSIZE,FreqTime(t1,freq)-FreqTime(t0,freq));
QueryPerformanceCounter(&t0);
for(i=0;i<LOOPS;i++) tXOR<char>(in1,in2,out,BUFFSIZE);
QueryPerformanceCounter(&t1);
_tprintf(__T("%s(%i,%i) = %lfms\r\n"),__TEXT("tXOR<char> "),LOOPS,BUFFSIZE,FreqTime(t1,freq)-FreqTime(t0,freq));
QueryPerformanceCounter(&t0);
for(i=0;i<LOOPS;i++) tXOR<short>(in1,in2,out,BUFFSIZE);
QueryPerformanceCounter(&t1);
_tprintf(__T("%s(%i,%i) = %lfms\r\n"),__TEXT("tXOR<short> "),LOOPS,BUFFSIZE,FreqTime(t1,freq)-FreqTime(t0,freq));
QueryPerformanceCounter(&t0);
for(i=0;i<LOOPS;i++) tXOR<unsigned int>(in1,in2,out,BUFFSIZE);
QueryPerformanceCounter(&t1);
_tprintf(__T("%s(%i,%i) = %lfms\r\n"),__TEXT("tXOR<unsigned int>"),LOOPS,BUFFSIZE,FreqTime(t1,freq)-FreqTime(t0,freq));
QueryPerformanceCounter(&t0);
for(i=0;i<LOOPS;i++) tXOR<__int64>(in1,in2,out,BUFFSIZE);
QueryPerformanceCounter(&t1);
_tprintf(__T("%s(%i,%i) = %lfms\r\n"),__TEXT("tXOR<__int64> "),LOOPS,BUFFSIZE,FreqTime(t1,freq)-FreqTime(t0,freq));
QueryPerformanceCounter(&t0);
for(i=0;i<LOOPS;i++) tXOR<__m128>(in1,in2,out,BUFFSIZE);
QueryPerformanceCounter(&t1);
_tprintf(__T("%s(%i,%i) = %lfms\r\n"),__TEXT("tXOR<__m128> "),LOOPS,BUFFSIZE,FreqTime(t1,freq)-FreqTime(t0,freq));
free(in1);
free(in2);
free(out);
_getch();
return 0;
}
output:
XOR (1000,1048576) = 1447.370089ms
tXOR<char> (1000,1048576) = 993.742449ms
tXOR<short> (1000,1048576) = 511.465385ms
tXOR<unsigned int>(1000,1048576) = 334.088394ms
tXOR<__int64> (1000,1048576) = 232.502586ms
tXOR<__m128> (1000,1048576) = 201.321703ms
Tested on an i3 CPU.
Regards.