Is this MMX implementation of a 4x4 matrix multiply on 16-bit integers OK?
void mult4x4(short m1[4][4], short m2[4][4], short res[4][4])
{
__asm
{
push esi
push edi
mov eax, m1
mov esi, m2
mov edi, res
; cache mem. access into free mmx reg.
movq mm5, [esi + 24]
mov edx, 8
mov ecx, 4
__repeat:
movq mm7, [eax]
movq mm0, [esi]
movq mm2, [esi + edx]
movq mm1, mm0
movq mm6, mm7
punpckldq mm7, mm6
punpcklwd mm0, mm2
punpckhwd mm1, mm2
pmaddwd mm0, mm7
pmaddwd mm1, mm7
movq mm3, mm0
movq mm4, mm1
movq mm7, mm6
punpckhdq mm7, mm6
movq mm0, [esi + edx * 2]
movq mm2, mm5 ; read row from reg. cache
movq mm1, mm0
punpcklwd mm0, mm2
punpckhwd mm1, mm2
pmaddwd mm0, mm7
pmaddwd mm1, mm7
paddd mm3, mm0
paddd mm4, mm1
packssdw mm3, mm3
packssdw mm4, mm4
punpckldq mm3, mm4
movq [edi], mm3
add edi, edx
add eax, edx
loop __repeat
emms
pop edi
pop esi
}
}
int main(void)
{
__declspec(align(16)) short m1[4][4] = { 1,1,1,1,2,2,2,2,3,3,3,3,4,4,4,4 };
__declspec(align(16)) short m2[4][4] = { 9,9,9,9,8,8,8,8,7,7,7,7,6,6,6,6 };
__declspec(align(16)) short result[4][4] = {0};
mult4x4(m1, m2, result);
for(int r = 0; r < 4; r++)
{
for(int c = 0; c < 4; c++)
cout << "\t" << result[r][c];
cout << endl;
}
cout << endl;
return 0;
}