
#include <xmmintrin.h>
#include <cstddef>
#include <cstring>


/// Returns lane `i` of the packed single-precision vector `m`.
///
/// @param m  source vector (four floats).
/// @param i  lane index; must be in [0, 3] — there is no bounds check.
/// @return   the float stored in lane `i`.
float fx(const __m128 & m,std::size_t i){
	// Copy the 16 bytes of the vector into a plain float array and index that.
	float lanes[4];
	std::memcpy(lanes,&m,sizeof lanes);
	return lanes[i];
}

/// Copies lane `i` of `m` into lane `i` of `*out`; the other three lanes of
/// `*out` are left untouched.
///
/// @param out  destination vector (must be non-null).
/// @param m    source vector; may be the same object as `*out`.
/// @param i    lane index; must be in [0, 3] — there is no bounds check.
void fx2(__m128 * out,const __m128 & m,std::size_t i){
	const float * const src_lanes = reinterpret_cast<const float*>(&m);
	float * const dst_lanes = reinterpret_cast<float*>(out);
	dst_lanes[i] = src_lanes[i];
}

// Lane accessors: take the address of `m` (e.g. an __m128), reinterpret it as
// a pointer to float and index element `i`.
//   IOF  - yields a modifiable float lvalue, so it can appear on the left of '='.
//   CIOF - same access through a pointer-to-const (read-only).
// `m` must be an lvalue (its address is taken). No bounds check is performed
// on `i`.
#define  IOF(m,i) reinterpret_cast<float*>(&(m))[i]
#define CIOF(m,i) reinterpret_cast<const float*>(&(m))[i]

// ---------------------------------------------------------------------------
// File-local helpers for fx3. They replace the former DEC / DIV_RCP macros
// with typed inline functions that perform the same intrinsic sequence but do
// not implicitly read or write the caller's local variables.
// ---------------------------------------------------------------------------

/// Per-lane approximate reciprocal of `x`: the hardware estimate from
/// _mm_rcp_ps refined by one Newton-Raphson step, r' = r * (2 - x * r).
static inline __m128 fx3_refined_rcp(const __m128 & x){
	const __m128 estimate = _mm_rcp_ps(x);
	return _mm_mul_ps(estimate,_mm_sub_ps(_mm_set1_ps(2),_mm_mul_ps(x,estimate)));
}

/// Normalises pivot row K in place.
///
/// Every lane of `pivot_row` is multiplied by the refined reciprocal of its
/// lane K (the pivot element); lane K itself is then overwritten with that
/// reciprocal. The reciprocal, broadcast to all four lanes, is returned so the
/// elimination steps for the other rows can reuse it.
template<int K>
static inline __m128 fx3_scale_pivot_row(__m128 & pivot_row){
	const __m128 pivot = _mm_shuffle_ps(pivot_row,pivot_row,_MM_SHUFFLE(K,K,K,K));
	const __m128 rcp = fx3_refined_rcp(pivot);
	pivot_row = _mm_mul_ps(pivot_row,rcp);
	reinterpret_cast<float*>(&pivot_row)[K] = reinterpret_cast<const float*>(&rcp)[K];
	return rcp;
}

/// Eliminates column K from `row` using the already-normalised `pivot_row`.
///
/// `rcp` is the value returned by fx3_scale_pivot_row<K>. `row` and
/// `pivot_row` must be different objects.
template<int K>
static inline void fx3_eliminate(__m128 & row,const __m128 & pivot_row,const __m128 & rcp){
	// Lane K of `negated` is -row[K] * (1 / pivot): the value that ends up in
	// column K of this row. It has to be computed before `row` is modified.
	const __m128 negated = _mm_sub_ps(_mm_setzero_ps(),_mm_mul_ps(row,rcp));
	const __m128 factor = _mm_shuffle_ps(row,row,_MM_SHUFFLE(K,K,K,K));
	row = _mm_sub_ps(row,_mm_mul_ps(pivot_row,factor));
	// Lane K of the subtraction above is meaningless; replace it.
	reinterpret_cast<float*>(&row)[K] = reinterpret_cast<const float*>(&negated)[K];
}

/// Inverts a 4x4 single-precision matrix with SSE, writing the result to `z`.
///
/// Each __m128 holds one row of the matrix (lane i = column i). `rhs` is
/// copied into `z`, which is then inverted in place by Gauss-Jordan
/// elimination: for every pivot k the pivot row is scaled by 1/z[k][k] and
/// column k is eliminated from the three other rows, visited in ascending row
/// order.
///
/// Limitations:
///  - No row exchanges (no pivoting): a zero pivot produces non-finite output.
///  - Reciprocals are approximate (_mm_rcp_ps plus one Newton-Raphson step),
///    so results are close to, but not bit-identical with, exact division.
///
/// @param z    output: four rows of the inverse.
/// @param rhs  input: four rows of the matrix to invert.
void fx3(__m128 z[4],const __m128 rhs[4]){
	z[0] = rhs[0];
	z[1] = rhs[1];
	z[2] = rhs[2];
	z[3] = rhs[3];

	__m128 rcp;

	// k = 0
	rcp = fx3_scale_pivot_row<0>(z[0]);
	fx3_eliminate<0>(z[1],z[0],rcp);
	fx3_eliminate<0>(z[2],z[0],rcp);
	fx3_eliminate<0>(z[3],z[0],rcp);

	// k = 1
	rcp = fx3_scale_pivot_row<1>(z[1]);
	fx3_eliminate<1>(z[0],z[1],rcp);
	fx3_eliminate<1>(z[2],z[1],rcp);
	fx3_eliminate<1>(z[3],z[1],rcp);

	// k = 2
	rcp = fx3_scale_pivot_row<2>(z[2]);
	fx3_eliminate<2>(z[0],z[2],rcp);
	fx3_eliminate<2>(z[1],z[2],rcp);
	fx3_eliminate<2>(z[3],z[2],rcp);

	// k = 3
	rcp = fx3_scale_pivot_row<3>(z[3]);
	fx3_eliminate<3>(z[0],z[3],rcp);
	fx3_eliminate<3>(z[1],z[3],rcp);
	fx3_eliminate<3>(z[2],z[3],rcp);
}


/// Inverts a 4x4 single-precision matrix by in-place Gauss-Jordan elimination.
///
/// `rhs` is copied into `a`, then for each pivot k the pivot row is divided by
/// the pivot a[k][k], the pivot slot is replaced by its reciprocal, and column
/// k is eliminated from every other row.
///
/// No row exchanges are performed (no pivoting): a zero pivot leads to a
/// division by zero and non-finite output.
///
/// @param a    output: receives the inverse. May be the same array as `rhs`
///             (in-place inversion).
/// @param rhs  input matrix; not modified unless it aliases `a`.
void invert (float a[4][4], float rhs[4][4]){
	// memmove rather than memcpy: the copy must stay well-defined when the
	// caller inverts in place and `a` and `rhs` refer to the same storage.
	std::memmove(a,rhs,sizeof(float)*4*4);

	for (std::size_t k = 0; k < 4; ++k) {
		const float t = a[k][k];                 // pivot element

		// Normalise the pivot row; the pivot slot then holds 1/t.
		for (std::size_t i = 0; i < 4; ++i) a[k][i] /= t;
		a[k][k] = 1.0f / t;

		for (std::size_t j = 0; j < 4; ++j) {
			if (j == k) continue;
			const float u = a[j][k];             // entry being eliminated
			for (std::size_t i = 0; i < 4; ++i) {
				if (i != k) a[j][i] -= a[k][i] * u;
				else        a[j][i] = -u / t;
			}
		}
	}
}






