#ifndef _TMSSE_MATRIX_HPP
#define _TMSSE_MATRIX_HPP

#include <cassert>//assert
#include <cmath>
#include <cstddef>//std::size_t
#include <cstdlib>
#include <sstream>
#include <xmmintrin.h>
#include "../vector/matrix_base.hpp"
#include "sse_vector.hpp"

// Lane accessors for an __m128 value:
//   IOF(m,i)  - writable lvalue for lane i of m
//   CIOF(m,i) - lane i of m for read-only use
#if defined(_MSC_VER) || defined(__ICC)
// These compilers are expected to expose the lanes through the m128_f32 member.
// (No token pasting: gluing ')' to '.' with ## does not form a valid token.)
#define  IOF(m,i) (m).m128_f32[i]
#define CIOF(m,i) (m).m128_f32[i]

#else

// Elsewhere the lanes are reached by viewing the register's storage as floats.
#define  IOF(m,i) reinterpret_cast<float*>(&(m))[i]
#define CIOF(m,i) reinterpret_cast<const float*>(&(m))[i]

#endif
// The conditional above must be closed here; otherwise the #endif at the end of
// the file closes it instead and the include guard is left unterminated.

// Wrapper applied to every pointer handed to _mm_loadu_ps. It is the identity
// today; the commented-out alternative strips const for intrinsic headers that
// declare the load with a non-const float*.
//#define CCAST(m) const_cast<float*>(m)
#define CCAST(m) (m)

/**
 *	Horizontal add: returns a register whose four lanes all hold
 *	r0 + r1 + r2 + r3.
 */
inline __m128 SUM_128(__m128 r){
	// (r2, r3, r0, r1): the two 64-bit halves exchanged
	const __m128 halves_swapped = _mm_shuffle_ps(r,r,_MM_SHUFFLE(1,0,3,2));
	// (r0+r2, r1+r3, r0+r2, r1+r3)
	const __m128 pair_sums = _mm_add_ps(halves_swapped,r);
	// neighbouring lanes exchanged, so each lane meets the other partial sum
	const __m128 neighbours_swapped = _mm_shuffle_ps(pair_sums,pair_sums,_MM_SHUFFLE(2,3,0,1));
	return _mm_add_ps(neighbours_swapped,pair_sums);
}
	
/**
 *	Gathers the diagonal of four registers: the result is (a0, b1, c2, d3).
 *
 *	Done with three shuffles so the values never leave the registers
 *	(no per-lane reads through memory).
 */
inline __m128 CVT424(__m128 a,__m128 b,__m128 c,__m128 d){
	const __m128 ab = _mm_shuffle_ps(a,b,_MM_SHUFFLE(1,1,0,0));	// (a0, a0, b1, b1)
	const __m128 cd = _mm_shuffle_ps(c,d,_MM_SHUFFLE(3,3,2,2));	// (c2, c2, d3, d3)
	return _mm_shuffle_ps(ab,cd,_MM_SHUFFLE(2,0,2,0));			// (a0, b1, c2, d3)
}




namespace tempest{
	
class sse_matrix;
	
/**
 *	4x4 single-precision matrix held in four SSE registers.
 *
 *	m[i] stores the elements (i,0)..(i,3); arithmetic is implemented with
 *	SSE1 intrinsics.
 */
class sse_matrix:public matrix_base< sse_matrix, float, 4, 4 >{
public:
	/// Writable view of one row; returned by the non-const operator[] so that mat[i][j] works.
	class prxy{
	public:
		prxy(__m128 & r):m(r){}
		float& operator[](std::size_t i)     { return  IOF(m,i);} 
		float  operator[](std::size_t i)const{return CIOF(m,i);} 
	private:
		__m128 & m;
	};

	/// Read-only view of one row; returned by the const operator[].
	class cst_prxy{
	public:
		cst_prxy(const __m128 & r):m(r){}
		float        operator[](std::size_t i)const{return CIOF(m,i);} 
	private:
		const __m128 & m;
	};
	
private:
	typedef		float					T;
public:
	typedef		T										value_type;		///< type of element
	typedef		T&										reference;		///< reference					
	typedef		const T&								const_reference;///< const reference
	typedef		sse_matrix								this_type;		///< self

	typedef		std::size_t								size_type;		///< type of element size
	typedef		std::ptrdiff_t							difference_type;///< difference type of pointer

	typedef		T*										iterator;		///< iterator
	typedef		const T*								const_iterator;	///< const iterator
	
	typedef 	T										param_type;		///< This is for parameter.
	typedef		const this_type &						param_this_type;
	
public:
	//-----------------------------------------------
	//size
	static const size_type row_size = 4;
	static const size_type col_size = 4;
	static const size_type c_size = 4*4;//16
	
public:
	//-----------------------------------------------
	//functions for iterator
	// [begin(), end()) walks all c_size floats, starting at element (0,0) and
	// continuing through m[0]..m[3] in storage order. The const and non-const
	// overloads cover the same range.
	iterator		begin()			{return &( IOF(m[0],0));}
	iterator		end()			{return begin() + c_size;}
	const_iterator	begin()	const	{return &(CIOF(m[0],0));}
	const_iterator	end()	const	{return begin() + c_size;}
	
public:
	/// Default construction leaves every element uninitialised.
	sse_matrix(){}
	
	/// Copies the four rows of rhs.
	sse_matrix(const sse_matrix& rhs){
		m[0] = rhs.m[0];
		m[1] = rhs.m[1];
		m[2] = rhs.m[2];
		m[3] = rhs.m[3];	
	}
	
	/// Builds the matrix from four row registers (a = row 0 ... d = row 3).
	explicit sse_matrix(const __m128 a,const __m128 b,const __m128 c, const __m128 d){
		m[0] = a;
		m[1] = b;
		m[2] = c;
		m[3] = d;	
	}
	
	/// Builds the matrix from an array of four row registers.
	explicit sse_matrix(const __m128 rhs[4]){
		m[0] = rhs[0];
		m[1] = rhs[1];
		m[2] = rhs[2];
		m[3] = rhs[3];	
	}
	
	/// Builds the matrix from sixteen scalars; _mRC is stored in lane C of m[R].
	explicit sse_matrix(
		float _m00, float _m01, float _m02, float _m03,
		float _m10, float _m11, float _m12, float _m13,
		float _m20, float _m21, float _m22, float _m23,
		float _m30, float _m31, float _m32, float _m33
	){
		m[0] = _mm_setr_ps(_m00,_m01,_m02,_m03);
		m[1] = _mm_setr_ps(_m10,_m11,_m12,_m13);
		m[2] = _mm_setr_ps(_m20,_m21,_m22,_m23);
		m[3] = _mm_setr_ps(_m30,_m31,_m32,_m33);
	}

	/// Loads 16 consecutive floats from array (unaligned loads); array[4*i+j] becomes element (i,j).
	explicit sse_matrix(const float * array){
		m[0] = (_mm_loadu_ps(CCAST(array+ 0)));
		m[1] = (_mm_loadu_ps(CCAST(array+ 4)));
		m[2] = (_mm_loadu_ps(CCAST(array+ 8)));
		m[3] = (_mm_loadu_ps(CCAST(array+ 12)));
	}
	
	/// Loads from a 4x4 C array (unaligned loads); rhs[i] becomes m[i].
	explicit sse_matrix(const float rhs[row_size][col_size]){
		m[0] = (_mm_loadu_ps(CCAST(rhs[0])));
		m[1] = (_mm_loadu_ps(CCAST(rhs[1])));
		m[2] = (_mm_loadu_ps(CCAST(rhs[2])));
		m[3] = (_mm_loadu_ps(CCAST(rhs[3])));
	}	
	
	
	~sse_matrix(){}
	
	//-----------------------------------------------
	//inserters
	/// Replaces the four rows with rhs[0]..rhs[3].
	this_type& operator = (const __m128 rhs[4]){
		m[0] = rhs[0];
		m[1] = rhs[1];
		m[2] = rhs[2];
		m[3] = rhs[3];
    	return *this;
	}
	
	/// Copy assignment.
	this_type& operator = (const sse_matrix& rhs){
		m[0] = rhs.m[0];
		m[1] = rhs.m[1];
		m[2] = rhs.m[2];
		m[3] = rhs.m[3];
    	return *this;
	}
	
	//-----------------------------------------------
	//capacity
	size_type size ()     const { return c_size; }
    size_type max_size () const { return c_size; }
    bool      empty ()    const { return false;	 }
	
	//-----------------------------------------------
	//operators
	
	/// Replaces every element e by (0 - e) and returns *this.
	this_type& negate(){
		m[0] = _mm_sub_ps(_mm_setzero_ps(),m[0]);
		m[1] = _mm_sub_ps(_mm_setzero_ps(),m[1]);
		m[2] = _mm_sub_ps(_mm_setzero_ps(),m[2]);
		m[3] = _mm_sub_ps(_mm_setzero_ps(),m[3]);
		return *this;
	}
	
	/// Element-wise addition.
	this_type& operator += (param_this_type rhs){
		m[0] = _mm_add_ps(m[0],rhs.m[0]);
		m[1] = _mm_add_ps(m[1],rhs.m[1]);
		m[2] = _mm_add_ps(m[2],rhs.m[2]);
		m[3] = _mm_add_ps(m[3],rhs.m[3]);
		return *this;
	}
	
	/// Element-wise subtraction.
	this_type& operator -= (param_this_type rhs){
		m[0] = _mm_sub_ps(m[0],rhs.m[0]);
		m[1] = _mm_sub_ps(m[1],rhs.m[1]);
		m[2] = _mm_sub_ps(m[2],rhs.m[2]);
		m[3] = _mm_sub_ps(m[3],rhs.m[3]);
		return *this;
	}
	
	/// Matrix product: *this = (*this) * rhs. Safe when rhs is *this, because mul() works on a copy of rhs.
	this_type& operator *= (param_this_type rhs){
		return *this = this->mul(rhs);
	}
	
	/// Returns the matrix product (*this) * rhs without modifying *this.
	this_type mul(param_this_type rhs)const{
		__m128 a = rhs.m[0];
		__m128 b = rhs.m[1];
		__m128 c = rhs.m[2];
		__m128 d = rhs.m[3];
		
		// After the transpose a..d hold the columns of rhs, so every result
		// element is a plain dot product of a row of *this with one of them.
		_MM_TRANSPOSE4_PS(a,b,c,d);
		
		return sse_matrix(
			row_times_columns(m[0],a,b,c,d),
			row_times_columns(m[1],a,b,c,d),
			row_times_columns(m[2],a,b,c,d),
			row_times_columns(m[3],a,b,c,d)
		);
	}
	
	
	/// Multiplies every element by rhs.
	this_type& operator *= (float rhs){
		__m128 rr = _mm_set1_ps(rhs);
		m[0] = _mm_mul_ps(m[0],rr);
		m[1] = _mm_mul_ps(m[1],rr);
		m[2] = _mm_mul_ps(m[2],rr);
		m[3] = _mm_mul_ps(m[3],rr);	
		
		return *this;
	}
	
	/**
	 *	Scales every element by an approximation of 1/rhs.
	 *
	 *	The reciprocal comes from _mm_rcp_ss and is refined by one
	 *	Newton-Raphson step, so results are not bit-identical to a true
	 *	division.
	 *	NOTE(review): for rhs == 0 the refinement multiplies 0 by the
	 *	reciprocal estimate, so the outcome is presumably NaN rather than
	 *	+/-inf - confirm that no caller relies on IEEE division semantics.
	 */
	this_type& operator /= (float rhs){
		__m128 x,r;
		
		x = _mm_set_ss(rhs);
		
		r = _mm_rcp_ss(x);	
		r = _mm_mul_ss(r,_mm_sub_ss(_mm_set_ss(2),_mm_mul_ss(x,r)));//R(i+1) = R * (2 - x * R)
		r = _mm_shuffle_ps(r,r,0);//broadcast lane 0 to all lanes
		
		m[0] = _mm_mul_ps(m[0],r);
		m[1] = _mm_mul_ps(m[1],r);
		m[2] = _mm_mul_ps(m[2],r);
		m[3] = _mm_mul_ps(m[3],r);	
		return *this;
	}
	
	//--------------------------------
	
	/// Row access: mat[i][j] is a writable reference to element (i,j). i is not range-checked.
	prxy operator[](size_type i){
		return prxy(m[i]);
	}
	
	/// Row access on a const matrix: mat[i][j] yields element (i,j) by value. i is not range-checked.
	cst_prxy operator[](size_type i) const {
		return cst_prxy(m[i]);
	}
	
	//-----------------------------------
	//
	
	/// Direct access to the register holding row i (not range-checked).
	__m128& get(int i){
		return m[i];
	}
	
	/// Copy of the register holding row i (not range-checked).
	__m128 get(int i) const {
		return m[i];
	}	
	
	/// Stores all 16 elements to array (unaligned stores); element (i,j) goes to array[4*i+j].
	void get(float *array) const {
		_mm_storeu_ps(array +  0,m[0]);
		_mm_storeu_ps(array +  4,m[1]);
		_mm_storeu_ps(array +  8,m[2]);
		_mm_storeu_ps(array + 12,m[3]);
	}	
	
	/// Loads all 16 elements from array (unaligned loads); array must provide 16 readable floats.
	void set(const float * array){		
		m[0] = (_mm_loadu_ps(CCAST(array +  0)));
		m[1] = (_mm_loadu_ps(CCAST(array +  4)));
		m[2] = (_mm_loadu_ps(CCAST(array +  8)));
		m[3] = (_mm_loadu_ps(CCAST(array + 12)));
	
	}
	
	/**
	 *	Copies the range [a,b) into the leading elements of the matrix.
	 *
	 *	At most c_size values are taken; elements beyond the end of the range
	 *	keep their current value. Nothing outside [a,b) is read, so a range
	 *	shorter than 16 floats is safe.
	 */
	void assign(const float * a,const float * b){
		const difference_type count = b - a;
		assert(count >= 0);
		assert(count <= static_cast<difference_type>(c_size));
		
		// Stage through a local buffer so the 16-float load never touches
		// memory past the end of the caller's range.
		float staged[c_size];
		this->get(staged);
		for(difference_type i = 0; i < count && i < static_cast<difference_type>(c_size); ++i){
			staged[i] = a[i];
		}
		this->set(staged);
	}
	
	
	//-----------------------------------------------
	// utilities
	/// Transposes the matrix in place and returns *this.
	this_type & transpose(){ 		
		_MM_TRANSPOSE4_PS(m[0],m[1],m[2],m[3]);
		return *this;
	}
	
	/**
	 *	Determinant, by cofactor expansion along column 0:
	 *	  m00*M0 - m10*M1 + m20*M2 - m30*M3
	 *	where Mi is the 3x3 minor left after removing row i and column 0.
	 */
	float det() const {
		// Column-0 element of each row, broadcast to every lane.
		const __m128 w0 = _mm_shuffle_ps(m[0],m[0],_MM_SHUFFLE(0,0,0,0));
		const __m128 w1 = _mm_shuffle_ps(m[1],m[1],_MM_SHUFFLE(0,0,0,0));
		const __m128 w2 = _mm_shuffle_ps(m[2],m[2],_MM_SHUFFLE(0,0,0,0));
		const __m128 w3 = _mm_shuffle_ps(m[3],m[3],_MM_SHUFFLE(0,0,0,0));
		
		// Lanes 1..3 accumulate the three partial sums of the expansion.
		__m128 x = _mm_mul_ps(w0,minor_lanes(m[1],m[2],m[3]));
		x = _mm_sub_ps(x,_mm_mul_ps(w1,minor_lanes(m[0],m[2],m[3])));
		x = _mm_add_ps(x,_mm_mul_ps(w2,minor_lanes(m[0],m[1],m[3])));
		x = _mm_sub_ps(x,_mm_mul_ps(w3,minor_lanes(m[0],m[1],m[2])));
		
		// Lane 0 only ever held column-0 products that are not part of the
		// determinant; clear it before the horizontal sum.
		IOF(x,0) = 0.0f;
		x = SUM_128(x);
		
		return IOF(x,0);
	}
	
	/// True when det() is not exactly zero; no tolerance is applied.
	bool is_invertible() const{// nonsingular
		return (this->det() != 0);//|M| != 0	
	}

public:
	const char* debug()const{return "tempest::sse_matrix";}

private:
	/// One row of a matrix product: (row.c0, row.c1, row.c2, row.c3), where c0..c3 are the columns of the right-hand matrix.
	static __m128 row_times_columns(const __m128& row,const __m128& c0,const __m128& c1,const __m128& c2,const __m128& c3){
		return CVT424(
			SUM_128(_mm_mul_ps(row,c0)),
			SUM_128(_mm_mul_ps(row,c1)),
			SUM_128(_mm_mul_ps(row,c2)),
			SUM_128(_mm_mul_ps(row,c3))
		);
	}
	
	/**
	 *	Terms of the 3x3 determinant built from columns 1..3 of the rows
	 *	r0, r1, r2. Lanes 1..3 each hold one (positive - negative) triple
	 *	product pair, so their sum is the minor; lane 0 is not meaningful.
	 */
	static __m128 minor_lanes(const __m128& r0,const __m128& r1,const __m128& r2){
		// The middle row uses the same ordering for both signs: (c0,c1,c3,c2).
		const __m128 mid = _mm_shuffle_ps(r1,r1,_MM_SHUFFLE(2,3,1,0));
		const __m128 positive = _mm_mul_ps(
			_mm_shuffle_ps(r2,r2,_MM_SHUFFLE(3,1,2,0)),						// (c0,c2,c1,c3)
			_mm_mul_ps(mid,_mm_shuffle_ps(r0,r0,_MM_SHUFFLE(1,2,3,0)))		// (c0,c3,c2,c1)
		);
		const __m128 negative = _mm_mul_ps(
			_mm_shuffle_ps(r2,r2,_MM_SHUFFLE(1,2,3,0)),						// (c0,c3,c2,c1)
			_mm_mul_ps(mid,_mm_shuffle_ps(r0,r0,_MM_SHUFFLE(3,1,2,0)))		// (c0,c2,c1,c3)
		);
		return _mm_sub_ps(positive,negative);
	}
	
public://deliberately not private: the free operators and callers use the rows directly
	__m128 m[4];
};

/// Unary plus: hands back the operand itself (by reference, no copy is made).
inline const sse_matrix& operator+ (const sse_matrix& rhs){
	return rhs;
}
	
/// Unary minus: returns a negated copy of rhs; rhs itself is untouched.
inline sse_matrix operator- (const sse_matrix& rhs){
	sse_matrix negated(rhs);
	negated.negate();
	return negated;
}


/// Element-wise sum of two matrices, built on operator+=.
inline sse_matrix operator + (const sse_matrix& lhs, const sse_matrix& rhs){
	sse_matrix sum(lhs);
	sum += rhs;
	return sum;
}

/// Element-wise difference of two matrices, built on operator-=.
inline sse_matrix operator - (const sse_matrix& lhs, const sse_matrix& rhs){
	sse_matrix difference(lhs);
	difference -= rhs;
	return difference;
}
	
/// Matrix product lhs * rhs; delegates to sse_matrix::mul.
inline sse_matrix operator* (const sse_matrix& lhs, const sse_matrix& rhs){
	const sse_matrix product = lhs.mul(rhs);
	return product;
}

/// Matrix scaled by a scalar on the right: every element of lhs times rhs.
inline sse_matrix operator * (const sse_matrix& lhs, float rhs){
	sse_matrix scaled(lhs);
	scaled *= rhs;
	return scaled;
}
/// Matrix scaled by a scalar on the left. Note the parameter names: here the scalar is `rhs` and the matrix is `lhs`.
inline sse_matrix operator * (float rhs, const sse_matrix& lhs){
	sse_matrix scaled(lhs);
	scaled *= rhs;
	return scaled;
}

/// Matrix divided by a scalar; built on sse_matrix::operator/= and shares its numerical behaviour.
inline sse_matrix operator / (const sse_matrix& lhs, float rhs){
	sse_matrix quotient(lhs);
	quotient /= rhs;
	return quotient;
}





//-----------------------------------------------
//compare
/**
 *	Exact element-wise equality: true only when all 16 elements compare equal.
 *	A NaN element never compares equal, so any NaN makes the result false.
 *
 *	Deliberately not a template: a template parameter that appears in no
 *	function parameter cannot be deduced, so `a == b` would never find it.
 */
inline bool operator== (const sse_matrix& lhs, const sse_matrix& rhs){
	// Each compare yields an all-ones lane where the elements match; and-ing
	// the four row masks keeps a lane set only if it matched in every row.
	__m128 equal = _mm_cmpeq_ps(lhs.get(0),rhs.get(0));
	equal = _mm_and_ps(equal,_mm_cmpeq_ps(lhs.get(1),rhs.get(1)));
	equal = _mm_and_ps(equal,_mm_cmpeq_ps(lhs.get(2),rhs.get(2)));
	equal = _mm_and_ps(equal,_mm_cmpeq_ps(lhs.get(3),rhs.get(3)));
	return _mm_movemask_ps(equal) == 0xF;
}

/**
 *	Exact element-wise inequality: true when at least one of the 16 elements
 *	differs. A NaN element counts as different.
 *
 *	Deliberately not a template: a template parameter that appears in no
 *	function parameter cannot be deduced, so `a != b` would never find it.
 */
inline bool operator!= (const sse_matrix& lhs, const sse_matrix& rhs){
	// Each compare yields an all-ones lane where the elements differ; or-ing
	// the four row masks keeps a lane set if it differed in any row.
	__m128 differs = _mm_cmpneq_ps(lhs.get(0),rhs.get(0));
	differs = _mm_or_ps(differs,_mm_cmpneq_ps(lhs.get(1),rhs.get(1)));
	differs = _mm_or_ps(differs,_mm_cmpneq_ps(lhs.get(2),rhs.get(2)));
	differs = _mm_or_ps(differs,_mm_cmpneq_ps(lhs.get(3),rhs.get(3)));
	return _mm_movemask_ps(differs) != 0;
}

//-----------------------------------------------
//output
/** 
 *	@name output
 */
//@{
	
/** 
 *	Stream insertion. Writes the matrix as four parenthesised groups of four
 *	values, e.g. ((a,b,c,d),(e,f,g,h),(i,j,k,l),(m,n,o,p)).
 *
 *	The text is first composed in a temporary string stream that copies the
 *	flags, locale and precision of os, and is then inserted into os as one
 *	string.
 */
template<typename CharT, class Traits>
std::basic_ostream<CharT, Traits>& operator<<(std::basic_ostream<CharT, Traits>& os, const sse_matrix& rhs){
	float values[16];
	rhs.get(values);
	
	std::basic_ostringstream<CharT, Traits> text;
	text.flags(os.flags());
	text.imbue(os.getloc());
	text.precision(os.precision());
	
	text << "(";
	for(std::size_t row = 0; row < 4; ++row){
		if(row != 0){text << ",";}
		text << "(";
		const float* group = values + 4*row;
		for(std::size_t col = 0; col < 4; ++col){
			if(col != 0){text << ",";}
			text << group[col];
		}
		text << ")";
	}
	text << ")";
	
	return os << text.str();
}

//@}
	
#undef IOF
#undef CIOF

}//End of namespace

#endif

