#ifndef		__T_COMP_TEMPLATE_SSE2_H_INCLUDE_
#define		__T_COMP_TEMPLATE_SSE2_H_INCLUDE_

namespace t_image_engine{

#ifdef USING_SSE2
//! All-zero 128-bit integer vector.
//! Built with the intrinsic instead of a brace initializer: the internal
//! layout of __m128i is compiler-specific (a union on MSVC, a two-lane
//! vector on GCC/Clang), so `{0,0,0,0}` is not portable.
static const __m128i m128izero_ = _mm_setzero_si128();

//! Byte-wise SSE2 compositing of one pixel row onto another.
//!
//! Every operation treats both rows as plain arrays of unsigned bytes
//! (`width * PIXELSIZE` bytes each) and combines them in place:
//! `d[i] = op(d[i], s[i])`. Rows are processed 16 bytes at a time with
//! unaligned loads/stores, so no alignment is required of `d` or `s`.
//!
//! \tparam _SRCT      source pixel type (only reinterpreted as bytes)
//! \tparam _DSTT      destination pixel type (only reinterpreted as bytes)
//! \tparam PIXELSIZE  number of bytes per pixel
template < class _SRCT, class _DSTT, int PIXELSIZE > 
class t_comp_uc_sse2{

public:

	//! Saturating add: d[i] = min(d[i] + s[i], 255).
	//! \param d      destination row, updated in place
	//! \param s      source row
	//! \param width  row length in pixels; values <= 0 do nothing
	static inline void add(_DSTT* d, const _SRCT* s, int width)
	{
		blend_row<op_add>(d, s, width);
	}

	//! Saturating subtract: d[i] = max(d[i] - s[i], 0).
	//! \param d      destination row, updated in place
	//! \param s      source row
	//! \param width  row length in pixels; values <= 0 do nothing
	static inline void sub(_DSTT* d, const _SRCT* s, int width)
	{
		blend_row<op_sub>(d, s, width);
	}

	//! Saturating multiply: d[i] = min(d[i] * s[i], 255).
	//! \param d      destination row, updated in place
	//! \param s      source row
	//! \param width  row length in pixels; values <= 0 do nothing
	static inline void multi(_DSTT* d, const _SRCT* s, int width)
	{
		blend_row<op_multi>(d, s, width);
	}

	//! Bitwise AND: d[i] = d[i] & s[i].
	//! Portable spelling of the operation historically exposed as `and`.
	//! \param d      destination row, updated in place
	//! \param s      source row
	//! \param width  row length in pixels; values <= 0 do nothing
	static inline void bit_and(_DSTT* d, const _SRCT* s, int width)
	{
		blend_row<op_and>(d, s, width);
	}

#if defined(_MSC_VER)
	//! Legacy name for bit_and(), kept so existing callers still build.
	//! `and` is an alternative token for `&&` in standard C++, so it can
	//! only be used as an identifier in MSVC's default (non-conforming)
	//! mode; new code should call bit_and() instead.
	static inline void and(_DSTT* d, const _SRCT* s, int width)
	{
		blend_row<op_and>(d, s, width);
	}
#endif

private:

	// Operation policies: each combines 16 destination bytes with 16 source
	// bytes and returns the 16 result bytes.

	struct op_add{
		static inline __m128i apply(__m128i dst, __m128i src)
		{
			return _mm_adds_epu8(dst, src);
		}
	};

	struct op_sub{
		static inline __m128i apply(__m128i dst, __m128i src)
		{
			return _mm_subs_epu8(dst, src);
		}
	};

	struct op_and{
		static inline __m128i apply(__m128i dst, __m128i src)
		{
			return _mm_and_si128(dst, src);
		}
	};

	struct op_multi{
		static inline __m128i apply(__m128i dst, __m128i src)
		{
			const __m128i zero = _mm_setzero_si128();
			const __m128i max8 = _mm_set1_epi16(0x00ff);

			// Widen bytes to 16-bit lanes; a byte*byte product (<= 65025)
			// always fits in an unsigned 16-bit lane.
			__m128i lo = _mm_mullo_epi16(
				_mm_unpacklo_epi8(src, zero), _mm_unpacklo_epi8(dst, zero));
			__m128i hi = _mm_mullo_epi16(
				_mm_unpackhi_epi8(src, zero), _mm_unpackhi_epi8(dst, zero));

			// Clamp to 255 as UNSIGNED values before packing:
			// x - max(x - 255, 0) == min(x, 255). _mm_packus_epi16 reads
			// its input as signed, so without this any product >= 32768
			// would be seen as negative and collapse to 0.
			lo = _mm_sub_epi16(lo, _mm_subs_epu16(lo, max8));
			hi = _mm_sub_epi16(hi, _mm_subs_epu16(hi, max8));

			return _mm_packus_epi16(lo, hi);
		}
	};

	//! Applies OP to every byte of the row, 16 bytes per step.
	//! The final partial block (fewer than 16 bytes) is staged through
	//! zero-padded local buffers so nothing outside the rows is touched.
	template < class OP >
	static inline void blend_row(_DSTT* d, const _SRCT* s, int width)
	{
		unsigned char* cd = reinterpret_cast<unsigned char*>(d);
		const unsigned char* cs = reinterpret_cast<const unsigned char*>(s);

		const int w = width * PIXELSIZE;
		if(w <= 0){
			// Nothing to do. Also keeps a negative byte count from
			// producing a bogus non-zero remainder below.
			return;
		}

		const int blocks = w / 16;
		for(int x = 0; x < blocks; ++x, cd += 16, cs += 16){
			const __m128i vd = _mm_loadu_si128(reinterpret_cast<const __m128i*>(cd));
			const __m128i vs = _mm_loadu_si128(reinterpret_cast<const __m128i*>(cs));
			_mm_storeu_si128(reinterpret_cast<__m128i*>(cd), OP::apply(vd, vs));
		}

		const int mod = w & 0xf;
		if(mod){
			// Only `mod` bytes remain in each row; copy them into padded
			// scratch blocks instead of loading a full 16 bytes from the rows.
			unsigned char td[16] = {0};
			unsigned char ts[16] = {0};
			memcpy(td, cd, mod);
			memcpy(ts, cs, mod);

			const __m128i vd = _mm_loadu_si128(reinterpret_cast<const __m128i*>(td));
			const __m128i vs = _mm_loadu_si128(reinterpret_cast<const __m128i*>(ts));
			_mm_storeu_si128(reinterpret_cast<__m128i*>(td), OP::apply(vd, vs));

			memcpy(cd, td, mod);
		}
	}

};

#endif

}

#endif