#include <windows.h> // ZeroMemory
#include <intrin.h>  // SSE intrinsics (_mm_set1_ps, _mm_loadu_ps, ...)
#include <math.h>    // sqrtf
struct Vector4
{
float x, y, z, w;
};
struct Matrix
{
float _M[4][4]; // row-major 4x4 matrix
public:
// Set the matrix to the identity
void Identity()
{
ZeroMemory((void*)_M, sizeof(float) * 16);
_M[0][0] = 1.0f;
_M[1][1] = 1.0f;
_M[2][2] = 1.0f;
_M[3][3] = 1.0f;
}
// Return the transposed matrix
Matrix Transpose()
{
Matrix ret;
ret._M[0][0] = _M[0][0];
ret._M[0][1] = _M[1][0];
ret._M[0][2] = _M[2][0];
ret._M[0][3] = _M[3][0];
ret._M[1][0] = _M[0][1];
ret._M[1][1] = _M[1][1];
ret._M[1][2] = _M[2][1];
ret._M[1][3] = _M[3][1];
ret._M[2][0] = _M[0][2];
ret._M[2][1] = _M[1][2];
ret._M[2][2] = _M[2][2];
ret._M[2][3] = _M[3][2];
ret._M[3][0] = _M[0][3];
ret._M[3][1] = _M[1][3];
ret._M[3][2] = _M[2][3];
ret._M[3][3] = _M[3][3];
return ret;
}
};
// Horizontal sum of all four components
float Sum(const Vector4 & arg1)
{
return arg1.x + arg1.y + arg1.z + arg1.w;
}
// Normalize the x/y/z components of the vector
Vector4 Normalize(Vector4 v1)
{
Vector4 v2;
float k = 1.0f / sqrtf(v1.x * v1.x + v1.y * v1.y + v1.z * v1.z);
v2.x = v1.x * k;
v2.y = v1.y * k;
v2.z = v1.z * k;
v2.w = v1.w; // keep w unchanged; only x/y/z are normalized
return v2;
}
// Vector addition
Vector4 SSE_VectorAdd(const Vector4 &Op_A, const Vector4 &Op_B)
{
Vector4 Ret_Vector;
__asm
{
MOV EAX, Op_A // Load pointers into CPU regs
MOV EBX, Op_B
MOVUPS XMM0, [EAX] // Move unaligned vectors to SSE regs
MOVUPS XMM1, [EBX]
ADDPS XMM0, XMM1 // v1 + v2
MOVUPS [Ret_Vector], XMM0 // Save the return vector
}
return Ret_Vector;
}
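// For comparison, the same addition written with compiler intrinsics instead of
// inline assembly. This is only a sketch (SSE_VectorAddIntrin is an added name,
// not part of the original code); intrinsics have the advantage of also compiling
// in x64 builds, where MSVC does not accept __asm blocks.
Vector4 SSE_VectorAddIntrin(const Vector4 &Op_A, const Vector4 &Op_B)
{
Vector4 Ret_Vector;
__m128 a = _mm_loadu_ps(&Op_A.x); // unaligned load of (x, y, z, w)
__m128 b = _mm_loadu_ps(&Op_B.x);
_mm_storeu_ps(&Ret_Vector.x, _mm_add_ps(a, b)); // component-wise add, then store
return Ret_Vector;
}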
// Vector cross product (uses x/y/z; the w of the result is 0)
Vector4 SSE_VectorCross(const Vector4 &Op_A, const Vector4 &Op_B)
{
Vector4 Ret_Vector;
__asm
{
MOV EAX, Op_A // Load pointers into CPU regs
MOV EBX, Op_B
MOVUPS XMM0, [EAX] // Move unaligned vectors to SSE regs
MOVUPS XMM1, [EBX]
MOVAPS XMM2, XMM0 // Make a copy of vector A
MOVAPS XMM3, XMM1 // Make a copy of vector B
SHUFPS XMM0, XMM0, 0xC9 // A -> (A.y, A.z, A.x, A.w)
SHUFPS XMM1, XMM1, 0xD2 // B -> (B.z, B.x, B.y, B.w)
MULPS XMM0, XMM1 // (A.y*B.z, A.z*B.x, A.x*B.y, A.w*B.w)
SHUFPS XMM2, XMM2, 0xD2 // A copy -> (A.z, A.x, A.y, A.w)
SHUFPS XMM3, XMM3, 0xC9 // B copy -> (B.y, B.z, B.x, B.w)
MULPS XMM2, XMM3 // (A.z*B.y, A.x*B.z, A.y*B.x, A.w*B.w)
SUBPS XMM0, XMM2 // (x, y, z) = A x B, w = 0
MOVUPS [Ret_Vector], XMM0 // Save the return vector
}
return Ret_Vector;
}
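// Plain scalar cross product, kept as a reference to check the SSE version above.
// (VectorCrossScalar is an added helper, not part of the original code.)
Vector4 VectorCrossScalar(const Vector4 &a, const Vector4 &b)
{
Vector4 r;
r.x = a.y * b.z - a.z * b.y;
r.y = a.z * b.x - a.x * b.z;
r.z = a.x * b.y - a.y * b.x;
r.w = 0.0f;
return r;
}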
// Vector scaling
Vector4 SSE_VectorScale(const Vector4 &Op_A, const float &Op_B)
{
Vector4 Ret_Vector;
__m128 F = _mm_set1_ps(Op_B); // Broadcast Op_B into all four lanes of a 128-bit register
__asm
{
MOV EAX, Op_A // Load pointer into CPU reg
MOVUPS XMM0, [EAX] // Move the vector to an SSE reg
MULPS XMM0, F // Multiply vectors
MOVUPS [Ret_Vector], XMM0 // Save the return vector
}
return Ret_Vector;
}
// Dot product of the x/y/z components (w is ignored)
void SSE_VectorDot(const Vector4 &Op_A, const Vector4 &Op_B, float& ret)
{
Vector4 v1;
__asm
{
MOV EAX, Op_A // Load pointer into CPU reg
MOV EBX, Op_B
MOVUPS XMM1, [EBX]
MOVUPS XMM0, [EAX] // Move the vector to an SSE reg
MULPS XMM0,XMM1 // Multiply vectors
MOVUPS [v1], XMM0 // Save the return vector
}
ret = v1.x + v1.y + v1.z; // horizontal sum of the element-wise products
}
// ret = v * m1 (row vector times row-major 4x4 matrix)
void SSE_VectorMultiplyMatrix(const Vector4& v,const Matrix& m1,Vector4& ret)
{
Vector4 va,vb,vc,vd;
Vector4 *pva,*pvb,*pvc,*pvd;
const Vector4 *pv;
// Extract each column of the matrix
va.x = m1._M[0][0];
va.y = m1._M[1][0];
va.z = m1._M[2][0];
va.w = m1._M[3][0];
vb.x = m1._M[0][1];
vb.y = m1._M[1][1];
vb.z = m1._M[2][1];
vb.w = m1._M[3][1];
vc.x = m1._M[0][2];
vc.y = m1._M[1][2];
vc.z = m1._M[2][2];
vc.w = m1._M[3][2];
vd.x = m1._M[0][3];
vd.y = m1._M[1][3];
vd.z = m1._M[2][3];
vd.w = m1._M[3][3];
pva = &va;
pvb = &vb;
pvc = &vc;
pvd = &vd;
pv = &v;
__asm
{
// Load the four matrix columns into XMM0-XMM3
MOV EAX, pva // Load pointer into CPU reg
MOVUPS XMM0, [EAX]
MOV EAX, pvb // Load pointer into CPU reg
MOVUPS XMM1, [EAX]
MOV EAX, pvc // Load pointer into CPU reg
MOVUPS XMM2, [EAX]
MOV EAX, pvd // Load pointer into CPU reg
MOVUPS XMM3, [EAX]
// Load the vector into XMM4
MOV EAX, pv
MOVUPS XMM4, [EAX]
// Multiply the vector element-wise with each column
MULPS XMM0,XMM4
MULPS XMM1,XMM4
MULPS XMM2,XMM4
MULPS XMM3,XMM4
// Store the four products
MOVUPS [va], XMM0
MOVUPS [vb], XMM1
MOVUPS [vc], XMM2
MOVUPS [vd], XMM3
}
// Sum the components of each product to get the transformed vector
ret.x = va.w + va.x + va.y + va.z;
ret.y = vb.w + vb.x + vb.y + vb.z;
ret.z = vc.w + vc.x + vc.y + vc.z;
ret.w = vd.w + vd.x + vd.y + vd.z;
}
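// Scalar reference for the vector * matrix transform above (row vector times
// row-major matrix). An added helper, not part of the original code; useful for
// checking the SSE path.
void VectorMultiplyMatrixScalar(const Vector4& v, const Matrix& m1, Vector4& ret)
{
ret.x = v.x * m1._M[0][0] + v.y * m1._M[1][0] + v.z * m1._M[2][0] + v.w * m1._M[3][0];
ret.y = v.x * m1._M[0][1] + v.y * m1._M[1][1] + v.z * m1._M[2][1] + v.w * m1._M[3][1];
ret.z = v.x * m1._M[0][2] + v.y * m1._M[1][2] + v.z * m1._M[2][2] + v.w * m1._M[3][2];
ret.w = v.x * m1._M[0][3] + v.y * m1._M[1][3] + v.z * m1._M[2][3] + v.w * m1._M[3][3];
}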
// ret = arg1 * arg2 (4x4 matrix product)
void SSE_MatrixMultiplyMatrix(const Matrix& arg1,const Matrix& arg2,Matrix & ret)
{
Matrix m1,m2;
Vector4 v1,v2,v3,v4,v5,v6,v7,v8,v9,v10,v11,v12,v13,v14,v15,v16;
Vector4 *pv1,*pv2,*pv3,*pv4,*pv5,*pv6,*pv7,*pv8;
m1 = arg1;
m2 = arg2;
m2 = m2.Transpose(); // transpose so the columns of arg2 become contiguous rows
// pv1-pv4 point at the rows of m1; pv5-pv8 at the rows of the transposed m2 (the columns of arg2)
pv1 = (Vector4*)&m1._M[0][0];
pv2 = (Vector4*)&m1._M[1][0];
pv3 = (Vector4*)&m1._M[2][0];
pv4 = (Vector4*)&m1._M[3][0];
pv5 = (Vector4*)&m2._M[0][0];
pv6 = (Vector4*)&m2._M[1][0];
pv7 = (Vector4*)&m2._M[2][0];
pv8 = (Vector4*)&m2._M[3][0];
__asm
{
MOV EAX, pv5
MOV EBX, pv6
MOV ECX, pv7
MOV EDX, pv8
MOVUPS XMM1, [EAX]
MOVUPS XMM2, [EBX]
MOVUPS XMM3, [ECX]
MOVUPS XMM4, [EDX]
MOV EAX, pv1
MOVUPS XMM0, [EAX]
MULPS XMM1, XMM0
MOVUPS[v1], XMM1
MULPS XMM2, XMM0
MOVUPS[v2], XMM2
MULPS XMM3, XMM0
MOVUPS[v3], XMM3
MULPS XMM4, XMM0
MOVUPS[v4], XMM4
MOV EAX, pv5
MOV EBX, pv6
MOV ECX, pv7
MOV EDX, pv8
MOVUPS XMM1, [EAX]
MOVUPS XMM2, [EBX]
MOVUPS XMM3, [ECX]
MOVUPS XMM4, [EDX]
MOV EAX, pv2
MOVUPS XMM0, [EAX]
MULPS XMM1, XMM0
MOVUPS[v5], XMM1
MULPS XMM2, XMM0
MOVUPS[v6], XMM2
MULPS XMM3, XMM0
MOVUPS[v7], XMM3
MULPS XMM4, XMM0
MOVUPS[v8], XMM4
MOV EAX, pv5
MOV EBX, pv6
MOV ECX, pv7
MOV EDX, pv8
MOVUPS XMM1, [EAX]
MOVUPS XMM2, [EBX]
MOVUPS XMM3, [ECX]
MOVUPS XMM4, [EDX]
MOV EAX, pv3
MOVUPS XMM0, [EAX]
MULPS XMM1, XMM0
MOVUPS[v9], XMM1
MULPS XMM2, XMM0
MOVUPS[v10], XMM2
MULPS XMM3, XMM0
MOVUPS[v11], XMM3
MULPS XMM4, XMM0
MOVUPS[v12], XMM4
MOV EAX, pv5
MOV EBX, pv6
MOV ECX, pv7
MOV EDX, pv8
MOVUPS XMM1, [EAX]
MOVUPS XMM2, [EBX]
MOVUPS XMM3, [ECX]
MOVUPS XMM4, [EDX]
MOV EAX, pv4
MOVUPS XMM0, [EAX]
MULPS XMM1, XMM0
MOVUPS[v13], XMM1
MULPS XMM2, XMM0
MOVUPS[v14], XMM2
MULPS XMM3, XMM0
MOVUPS[v15], XMM3
MULPS XMM4, XMM0
MOVUPS[v16], XMM4
}
// Each result element is the dot product of a row of arg1 with a column of arg2
ret._M[0][0] = Sum(v1);
ret._M[0][1] = Sum(v2);
ret._M[0][2] = Sum(v3);
ret._M[0][3] = Sum(v4);
ret._M[1][0] = Sum(v5);
ret._M[1][1] = Sum(v6);
ret._M[1][2] = Sum(v7);
ret._M[1][3] = Sum(v8);
ret._M[2][0] = Sum(v9);
ret._M[2][1] = Sum(v10);
ret._M[2][2] = Sum(v11);
ret._M[2][3] = Sum(v12);
ret._M[3][0] = Sum(v13);
ret._M[3][1] = Sum(v14);
ret._M[3][2] = Sum(v15);
ret._M[3][3] = Sum(v16);
return;
}
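// Minimal usage sketch (added, not part of the original code). Requires a 32-bit
// MSVC build, since the __asm blocks above are x86-only. Builds a translation
// matrix by hand and transforms a point with the SSE routine.
#include <stdio.h>
int main()
{
Matrix m;
m.Identity();
m._M[3][0] = 10.0f; // translation stored in the last row
m._M[3][1] = 20.0f;
m._M[3][2] = 30.0f;
Vector4 p = { 1.0f, 2.0f, 3.0f, 1.0f }; // point with w = 1
Vector4 out;
SSE_VectorMultiplyMatrix(p, m, out);
printf("%f %f %f %f\n", out.x, out.y, out.z, out.w); // expected: 11 22 33 1
return 0;
}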