Maybe a variant :) :
/**
 * Widens a buffer of 16-bit samples to 32-bit integers and loads the first
 * sixteen widened values into four SSE2 integer registers.
 *
 * The call is a no-op when pshData is null or when fewer than sixteen
 * samples are available.
 *
 * @param pshData  Source samples; iCount elements are read from it.
 * @param iCount   Number of samples available in pshData.
 */
void tom::add(short* pshData, int iCount)
{
    // Four registers, each holding four 32-bit lanes.
    const int kValuesNeeded = 4 * 4;
    if (pshData == nullptr || iCount < kValuesNeeded) {
        return;
    }

    // Widen every sample from short to int into a temporary buffer.
    int* piData = new int[iCount];
    for (int i = 0; i < iCount; ++i) {
        piData[i] = pshData[i];
    }

    // operator new[] does not promise 16-byte alignment for an int array on
    // every target, so use the unaligned load; the aligned form
    // (_mm_load_si128) may fault on a misaligned address.
    const __m128i* p128Data = reinterpret_cast<const __m128i*>(piData);
    const __m128i f0 = _mm_loadu_si128(p128Data + 0);
    const __m128i f1 = _mm_loadu_si128(p128Data + 1);
    const __m128i f2 = _mm_loadu_si128(p128Data + 2);
    const __m128i f3 = _mm_loadu_si128(p128Data + 3);

    // The registers are not consumed yet; silence unused-variable warnings
    // until the actual computation is added here.
    (void)f0;
    (void)f1;
    (void)f2;
    (void)f3;

    delete[] piData;
}