/* FrameReconstructor_MMX.c */
/* 2009/07/09               */

#include "StdAfx.h"

#include "FrameReconstructor.h"

#include "MotionComp_MMX.h"

/* */

#pragma warning(disable : 4799)

/* */

static __inline void Transpose_MMX(
	const INT16* x,
	INT16*       y)
{
	const __m64* X = (const __m64*)x;
	__m64*       Y = (__m64*)y;

	__m64 t0, t1, t2, t3;
	__m64 u0, u1, u2, u3;

	/* */

	t0 = X[2 * 0];
	t1 = X[2 * 1];
	t2 = X[2 * 2];
	t3 = X[2 * 3];

	u0 = _mm_unpacklo_pi16(t0, t1);
	u1 = _mm_unpackhi_pi16(t0, t1);
	u2 = _mm_unpacklo_pi16(t2, t3);
	u3 = _mm_unpackhi_pi16(t2, t3);

	Y[2 * 0] = _mm_unpacklo_pi32(u0, u2);
	Y[2 * 1] = _mm_unpackhi_pi32(u0, u2);
	Y[2 * 2] = _mm_unpacklo_pi32(u1, u3);
	Y[2 * 3] = _mm_unpackhi_pi32(u1, u3);

	/* */

	t0 = X[2 * 0 + 1];
	t1 = X[2 * 1 + 1];
	t2 = X[2 * 2 + 1];
	t3 = X[2 * 3 + 1];

	u0 = _mm_unpacklo_pi16(t0, t1);
	u1 = _mm_unpackhi_pi16(t0, t1);
	u2 = _mm_unpacklo_pi16(t2, t3);
	u3 = _mm_unpackhi_pi16(t2, t3);

	Y[2 * 4] = _mm_unpacklo_pi32(u0, u2);
	Y[2 * 5] = _mm_unpackhi_pi32(u0, u2);
	Y[2 * 6] = _mm_unpacklo_pi32(u1, u3);
	Y[2 * 7] = _mm_unpackhi_pi32(u1, u3);

	/* */

	t0 = X[2 * 4];
	t1 = X[2 * 5];
	t2 = X[2 * 6];
	t3 = X[2 * 7];

	u0 = _mm_unpacklo_pi16(t0, t1);
	u1 = _mm_unpackhi_pi16(t0, t1);
	u2 = _mm_unpacklo_pi16(t2, t3);
	u3 = _mm_unpackhi_pi16(t2, t3);

	Y[2 * 0 + 1] = _mm_unpacklo_pi32(u0, u2);
	Y[2 * 1 + 1] = _mm_unpackhi_pi32(u0, u2);
	Y[2 * 2 + 1] = _mm_unpacklo_pi32(u1, u3);
	Y[2 * 3 + 1] = _mm_unpackhi_pi32(u1, u3);

	/* */

	t0 = X[2 * 4 + 1];
	t1 = X[2 * 5 + 1];
	t2 = X[2 * 6 + 1];
	t3 = X[2 * 7 + 1];

	u0 = _mm_unpacklo_pi16(t0, t1);
	u1 = _mm_unpackhi_pi16(t0, t1);
	u2 = _mm_unpacklo_pi16(t2, t3);
	u3 = _mm_unpackhi_pi16(t2, t3);

	Y[2 * 4 + 1] = _mm_unpacklo_pi32(u0, u2);
	Y[2 * 5 + 1] = _mm_unpackhi_pi32(u0, u2);
	Y[2 * 6 + 1] = _mm_unpacklo_pi32(u1, u3);
	Y[2 * 7 + 1] = _mm_unpackhi_pi32(u1, u3);
}

/*
 * Rebuilds the reconstructor's dequantization matrices from the decoder's.
 *
 * For every quantizer index in use (t->Header.NQIS of them), both entries of
 * the second dimension and all three entries of the third, the 8x8 matrix in
 * t->Dequantize is written, transposed, into t->Reconstructor->Matrix.
 */
void QT_UpdateDequantizeMatrix_MMX(
	FrameDecoder_t* t)
{
	FrameReconstructor_SSE2_t* recon = t->Reconstructor;

	INT32 qis, kind, plane;

	for (qis = 0; qis < t->Header.NQIS; qis++) {
		for (kind = 0; kind < 2; kind++) {
			for (plane = 0; plane < 3; plane++) {
				Transpose_MMX(
					t->Dequantize[qis].Matrix[kind][plane],
					recon->Matrix[qis][kind][plane]);
			}
		}
	}
}

/* */

/*
 * Copies the 8x8 pixel block at (x, y) from plane r into plane p.
 *
 * All eight source rows are read before the first destination row is
 * written. Each plane is addressed with its own Pitch.
 */
static __inline void Block_CopyPlane8x8_MMX(
	Plane_t* p,
	INT32    x,
	INT32    y,
	Plane_t* r)
{
	const UINT8* src = r->Plane + y * r->Pitch + x;
	UINT8*       dst = p->Plane + y * p->Pitch + x;

	__m64 rows[8];
	INT32 i;

	for (i = 0; i < 8; i++) {
		rows[i] = *((const __m64*)(src + i * r->Pitch));
	}

	for (i = 0; i < 8; i++) {
		*((__m64*)(dst + i * p->Pitch)) = rows[i];
	}
}

/*
 * Copies the 16x16 pixel block at (x, y) from plane r into plane p.
 *
 * Rows are moved in batches of four: four source rows (two __m64 each) are
 * loaded, then stored. The loop runs while the source pointer is below
 * src + 16 * r->Pitch, so it relies on r->Pitch being positive.
 */
static __inline void Block_CopyPlane16x16_MMX(
	Plane_t* p,
	INT32    x,
	INT32    y,
	Plane_t* r)
{
	const UINT8* src = r->Plane + y * r->Pitch + x;
	const UINT8* end = src + 16 * r->Pitch;
	UINT8*       dst = p->Plane + y * p->Pitch + x;

	__m64 left[4];
	__m64 right[4];
	INT32 i;

	while (src < end) {
		for (i = 0; i < 4; i++, src += r->Pitch) {
			left[i]  = *((const __m64*)(src + 0));
			right[i] = *((const __m64*)(src + 8));
		}

		for (i = 0; i < 4; i++, dst += p->Pitch) {
			*((__m64*)(dst + 0)) = left[i];
			*((__m64*)(dst + 8)) = right[i];
		}
	}
}

/* */

/* Four 16-bit lanes of 128: the constant that Block_CopyIntra8x8_MMX adds to
   every residual before saturating the result to an unsigned byte. */
ALIGN(0x10) static const UINT16 IPRED[4] = {
	128, 128, 128, 128
};

/*
 * Writes an intra-coded 8x8 block into plane p at (x, y).
 *
 * c holds 64 residuals, eight per row. Each residual has 128 (IPRED) added
 * with signed saturation and is then packed to an unsigned byte with
 * saturation; every row is stored as two 32-bit writes.
 */
static __inline void Block_CopyIntra8x8_MMX(
	Plane_t*     p,
	INT32        x,
	INT32        y,
	const INT16* c)
{
	UINT8* base = p->Plane + y * p->Pitch + x;

	const __m64* res  = (const __m64*)c;
	const __m64  bias = *((const __m64*)IPRED);
	const __m64  zero = _mm_setzero_si64();

	INT32 row;

	for (row = 0; row < 8; row++, res += 2) {
		const __m64 lo = _mm_packs_pu16(_mm_adds_pi16(res[0], bias), zero);
		const __m64 hi = _mm_packs_pu16(_mm_adds_pi16(res[1], bias), zero);

		UINT8* d = base + row * p->Pitch;

		*((UINT32*)(d + 0)) = _mm_cvtsi64_si32(lo);
		*((UINT32*)(d + 4)) = _mm_cvtsi64_si32(hi);
	}
}

/*
 * Adds an 8x8 block of residuals to the pixels already in plane p at (x, y).
 *
 * For each row the eight existing bytes are widened to 16 bits, the matching
 * residuals from c are added with signed saturation, and the sums are packed
 * back to unsigned bytes with saturation. The loop runs while the row pointer
 * is below start + 8 * p->Pitch, so it relies on p->Pitch being positive.
 */
static __inline void Block_ReviseInter8x8_MMX(
	Plane_t*     p,
	INT32        x,
	INT32        y,
	const INT16* c)
{
	UINT8* row  = p->Plane + y * p->Pitch + x;
	UINT8* stop = row + 8 * p->Pitch;

	const __m64* res  = (const __m64*)c;
	const __m64  zero = _mm_setzero_si64();

	__m64 px_lo, px_hi;
	__m64 out_lo, out_hi;

	while (row < stop) {
		px_lo = _mm_unpacklo_pi8(_mm_cvtsi32_si64(*((UINT32*)(row + 0))), zero);
		px_hi = _mm_unpacklo_pi8(_mm_cvtsi32_si64(*((UINT32*)(row + 4))), zero);

		out_lo = _mm_packs_pu16(_mm_adds_pi16(res[0], px_lo), zero);
		out_hi = _mm_packs_pu16(_mm_adds_pi16(res[1], px_hi), zero);

		*((UINT32*)(row + 0)) = _mm_cvtsi64_si32(out_lo);
		*((UINT32*)(row + 4)) = _mm_cvtsi64_si32(out_hi);

		row += p->Pitch;
		res += 2;
	}
}

/* */

/*
 * Constants for the inverse DCT, each replicated across four 16-bit lanes.
 *
 * Row 0 is not a cosine: it is the value 8 that the column pass adds before
 * its arithmetic shift right by 4 (rounding).
 * Rows 1..7 are round(cos(k * pi / 16) * 65536) for k = 1..7.
 */
ALIGN(0x10) static const UINT16 COS_MMX[8][4] = {
	{     8,     8,     8,     8 }, /* 0 */
	{ 64277, 64277, 64277, 64277 }, /* 1 */
	{ 60547, 60547, 60547, 60547 }, /* 2 */
	{ 54491, 54491, 54491, 54491 }, /* 3 */
	{ 46341, 46341, 46341, 46341 }, /* 4 */
	{ 36410, 36410, 36410, 36410 }, /* 5 */
	{ 25080, 25080, 25080, 25080 }, /* 6 */
	{ 12785, 12785, 12785, 12785 }  /* 7 */
};

/*
 * Multiply X by the table constant T in 16.16 fixed point, keeping the high
 * 16 bits. Both macros expect a local named C that points at COS_MMX viewed
 * as __m64 rows.
 *
 * _mm_mulhi_pi16 is a signed multiply, so a constant of 32768 or more is seen
 * as (constant - 65536). MUL1 adds X back to compensate and is used for rows
 * 1..5; MUL0 is used for rows 6 and 7, whose constants fit in a signed 16-bit
 * lane. MUL1 expands X twice.
 */
#define MUL1(T,X) _mm_add_pi16(_mm_mulhi_pi16(X, C[T]), X)
#define MUL0(T,X) _mm_mulhi_pi16(X, C[T])

/* */

/*
 * First 8-point inverse-DCT pass over an 8x8 block of INT16 (the caller
 * labels it the "row" pass).
 *
 * x and y are each 8 matrix rows of two __m64 (4 lanes each). The transform
 * combines X[2 * k] for k = 0..7, i.e. the same four lanes of all eight
 * matrix rows, so four independent 8-point transforms run per iteration; the
 * loop runs twice (X++, Y++) to cover both halves of the rows.
 *
 * With Ck = cos(k * pi / 16) from COS_MMX, stage 1 forms
 *   t0 = C4 (x0 + x4)      t1 = C4 (x0 - x4)
 *   t2 = C6 x2 - C2 x6     t3 = C2 x2 + C6 x6
 *   t4 = C7 x1 - C1 x7     t5 = C3 x5 - C5 x3
 *   t6 = C5 x5 + C3 x3     t7 = C1 x1 + C7 x7
 * and stages 2-4 are the butterflies that produce the eight outputs.
 * No rounding or scaling is applied to the outputs in this pass.
 */
static __inline void IDCT_R_8_MMX(
	const INT16* x,
	INT16*       y)
{
	/* Must be named C: the MUL0/MUL1 macros reference it. */
	const __m64* C = (const __m64*)COS_MMX[0];
	const __m64* X = (const __m64*)x;
	const __m64* E = X + 2;
	__m64*       Y = (__m64*)y;

	__m64 s0;
	__m64 t0, t1, t2, t3, t4, t5, t6, t7;

	for (; X < E; X++, Y++) {
		/* Stage.1 */

		s0 = _mm_add_pi16(X[2 * 0], X[2 * 4]);
		t0 = MUL1(4, s0);

		s0 = _mm_sub_pi16(X[2 * 0], X[2 * 4]);
		t1 = MUL1(4, s0);

		t2 = _mm_sub_pi16(MUL0(6, X[2 * 2]), MUL1(2, X[2 * 6]));
		t3 = _mm_add_pi16(MUL1(2, X[2 * 2]), MUL0(6, X[2 * 6]));

		t4 = _mm_sub_pi16(MUL0(7, X[2 * 1]), MUL1(1, X[2 * 7]));
		t5 = _mm_sub_pi16(MUL1(3, X[2 * 5]), MUL1(5, X[2 * 3]));

		t6 = _mm_add_pi16(MUL1(5, X[2 * 5]), MUL1(3, X[2 * 3]));
		t7 = _mm_add_pi16(MUL1(1, X[2 * 1]), MUL0(7, X[2 * 7]));

		/* Stage.2 */

		/* s0 keeps the difference; the sum overwrites t4 / t7 before the
		   difference is scaled by C4 into t5 / t6. */
		s0 = _mm_sub_pi16(t4, t5);
		t4 = _mm_add_pi16(t4, t5);
		t5 = MUL1(4, s0);

		s0 = _mm_sub_pi16(t7, t6);
		t7 = _mm_add_pi16(t7, t6);
		t6 = MUL1(4, s0);

		/* Stage.3 */

		/* From here s0 holds t0 - t3 and is used as an output term below. */
		s0 = _mm_sub_pi16(t0, t3);
		t0 = _mm_add_pi16(t0, t3);

		t3 = _mm_sub_pi16(t1, t2);
		t1 = _mm_add_pi16(t1, t2);

		t2 = _mm_sub_pi16(t6, t5);
		t6 = _mm_add_pi16(t6, t5);

		/* Stage.4 */

		Y[2 * 0] = _mm_add_pi16(t0, t7);
		Y[2 * 1] = _mm_add_pi16(t1, t6);
		Y[2 * 2] = _mm_add_pi16(t3, t2);
		Y[2 * 3] = _mm_add_pi16(s0, t4);
		Y[2 * 4] = _mm_sub_pi16(s0, t4);
		Y[2 * 5] = _mm_sub_pi16(t3, t2);
		Y[2 * 6] = _mm_sub_pi16(t1, t6);
		Y[2 * 7] = _mm_sub_pi16(t0, t7);
	}
}

/*
 * Reduced variant of IDCT_R_8_MMX for sparse blocks.
 *
 * Only X[0], X[2], X[4] and X[6] are read (the first four lanes of matrix
 * rows 0..3); inputs 4..7 of the 8-point transform are treated as zero, which
 * is why every stage-1 term has a single operand. There is no loop, so only
 * the first four lanes of each of the eight output rows (Y[2 * k]) are
 * written; the second half of every output row is left untouched.
 */
static __inline void IDCT_R_8_4_MMX(
	const INT16* x,
	INT16*       y)
{
	/* Must be named C: the MUL0/MUL1 macros reference it. */
	const __m64* C = (const __m64*)COS_MMX[0];
	const __m64* X = (const __m64*)x;
	__m64*       Y = (__m64*)y;

	__m64 s0;
	__m64 t0, t1, t2, t3, t4, t5, t6, t7;

	/* Stage.1 */

	/* With x4 == 0 the sum and difference terms coincide. */
	t1 = t0 = MUL1(4, X[2 * 0]);

	t2 = MUL0(6, X[2 * 2]);
	t3 = MUL1(2, X[2 * 2]);

	t4 = MUL0(7, X[2 * 1]);
	/* t5 = -(C5 * x3): the x5 term of the full transform is zero here. */
	t5 = _mm_sub_pi16(_mm_setzero_si64(), MUL1(5, X[2 * 3]));

	t6 = MUL1(3, X[2 * 3]);
	t7 = MUL1(1, X[2 * 1]);

	/* Stage.2 */

	s0 = _mm_sub_pi16(t4, t5);
	t4 = _mm_add_pi16(t4, t5);
	t5 = MUL1(4, s0);

	s0 = _mm_sub_pi16(t7, t6);
	t7 = _mm_add_pi16(t7, t6);
	t6 = MUL1(4, s0);

	/* Stage.3 */

	s0 = _mm_sub_pi16(t0, t3);
	t0 = _mm_add_pi16(t0, t3);

	t3 = _mm_sub_pi16(t1, t2);
	t1 = _mm_add_pi16(t1, t2);

	t2 = _mm_sub_pi16(t6, t5);
	t6 = _mm_add_pi16(t6, t5);

	/* Stage.4 */

	Y[2 * 0] = _mm_add_pi16(t0, t7);
	Y[2 * 1] = _mm_add_pi16(t1, t6);
	Y[2 * 2] = _mm_add_pi16(t3, t2);
	Y[2 * 3] = _mm_add_pi16(s0, t4);
	Y[2 * 4] = _mm_sub_pi16(s0, t4);
	Y[2 * 5] = _mm_sub_pi16(t3, t2);
	Y[2 * 6] = _mm_sub_pi16(t1, t6);
	Y[2 * 7] = _mm_sub_pi16(t0, t7);
}

/*
 * Second 8-point inverse-DCT pass over an 8x8 block of INT16 (the caller
 * labels it the "column" pass; it runs on the transposed output of the first
 * pass).
 *
 * Stages 1-3 are the same butterflies as IDCT_R_8_MMX. Stage 4 additionally
 * adds C[0] (8 in every lane) to each output and shifts it right
 * arithmetically by 4, i.e. a rounded division by 16. The loop runs twice
 * (X++, Y++) to cover both four-lane halves of the rows.
 */
static __inline void IDCT_C_8_MMX(
	const INT16* x,
	INT16*       y)
{
	/* Must be named C: the MUL0/MUL1 macros reference it. */
	const __m64* C = (const __m64*)COS_MMX[0];
	const __m64* X = (const __m64*)x;
	const __m64* E = X + 2;
	__m64*       Y = (__m64*)y;

	__m64 s0;
	__m64 t0, t1, t2, t3, t4, t5, t6, t7;

	for (; X < E; X++, Y++) {
		/* Stage.1 */

		s0 = _mm_add_pi16(X[2 * 0], X[2 * 4]);
		t0 = MUL1(4, s0);

		s0 = _mm_sub_pi16(X[2 * 0], X[2 * 4]);
		t1 = MUL1(4, s0);

		t2 = _mm_sub_pi16(MUL0(6, X[2 * 2]), MUL1(2, X[2 * 6]));
		t3 = _mm_add_pi16(MUL1(2, X[2 * 2]), MUL0(6, X[2 * 6]));

		t4 = _mm_sub_pi16(MUL0(7, X[2 * 1]), MUL1(1, X[2 * 7]));
		t5 = _mm_sub_pi16(MUL1(3, X[2 * 5]), MUL1(5, X[2 * 3]));

		t6 = _mm_add_pi16(MUL1(5, X[2 * 5]), MUL1(3, X[2 * 3]));
		t7 = _mm_add_pi16(MUL1(1, X[2 * 1]), MUL0(7, X[2 * 7]));

		/* Stage.2 */

		s0 = _mm_sub_pi16(t4, t5);
		t4 = _mm_add_pi16(t4, t5);
		t5 = MUL1(4, s0);

		s0 = _mm_sub_pi16(t7, t6);
		t7 = _mm_add_pi16(t7, t6);
		t6 = MUL1(4, s0);

		/* Stage.3 */

		s0 = _mm_sub_pi16(t0, t3);
		t0 = _mm_add_pi16(t0, t3);

		t3 = _mm_sub_pi16(t1, t2);
		t1 = _mm_add_pi16(t1, t2);

		t2 = _mm_sub_pi16(t6, t5);
		t6 = _mm_add_pi16(t6, t5);

		/* Stage.4 */

		/* Each output is (value + 8) >> 4. */
		Y[2 * 0] = _mm_srai_pi16(_mm_add_pi16(_mm_add_pi16(t0, t7), C[0]), 4);
		Y[2 * 1] = _mm_srai_pi16(_mm_add_pi16(_mm_add_pi16(t1, t6), C[0]), 4);
		Y[2 * 2] = _mm_srai_pi16(_mm_add_pi16(_mm_add_pi16(t3, t2), C[0]), 4);
		Y[2 * 3] = _mm_srai_pi16(_mm_add_pi16(_mm_add_pi16(s0, t4), C[0]), 4);
		Y[2 * 4] = _mm_srai_pi16(_mm_add_pi16(_mm_sub_pi16(s0, t4), C[0]), 4);
		Y[2 * 5] = _mm_srai_pi16(_mm_add_pi16(_mm_sub_pi16(t3, t2), C[0]), 4);
		Y[2 * 6] = _mm_srai_pi16(_mm_add_pi16(_mm_sub_pi16(t1, t6), C[0]), 4);
		Y[2 * 7] = _mm_srai_pi16(_mm_add_pi16(_mm_sub_pi16(t0, t7), C[0]), 4);
	}
}

/*
 * Reduced variant of IDCT_C_8_MMX for sparse blocks.
 *
 * Only matrix rows 0..3 of the input are read (X[2 * 0] .. X[2 * 3]); inputs
 * 4..7 of the 8-point transform are treated as zero. Unlike IDCT_R_8_4_MMX
 * this pass loops twice (X++, Y++), so both four-lane halves of all eight
 * output rows are written. Each output is (value + 8) >> 4.
 */
static __inline void IDCT_C_8_4_MMX(
	const INT16* x,
	INT16*       y)
{
	/* Must be named C: the MUL0/MUL1 macros reference it. */
	const __m64* C = (const __m64*)COS_MMX[0];
	const __m64* X = (const __m64*)x;
	const __m64* E = X + 2;
	__m64*       Y = (__m64*)y;

	__m64 s0;
	__m64 t0, t1, t2, t3, t4, t5, t6, t7;

	for (; X < E; X++, Y++) {
		/* Stage.1 */

		/* With x4 == 0 the sum and difference terms coincide. */
		t1 = t0 = MUL1(4, X[2 * 0]);

		t2 = MUL0(6, X[2 * 2]);
		t3 = MUL1(2, X[2 * 2]);

		t4 = MUL0(7, X[2 * 1]);
		/* t5 = -(C5 * x3): the x5 term of the full transform is zero here. */
		t5 = _mm_sub_pi16(_mm_setzero_si64(), MUL1(5, X[2 * 3]));

		t6 = MUL1(3, X[2 * 3]);
		t7 = MUL1(1, X[2 * 1]);

		/* Stage.2 */

		s0 = _mm_sub_pi16(t4, t5);
		t4 = _mm_add_pi16(t4, t5);
		t5 = MUL1(4, s0);

		s0 = _mm_sub_pi16(t7, t6);
		t7 = _mm_add_pi16(t7, t6);
		t6 = MUL1(4, s0);

		/* Stage.3 */

		s0 = _mm_sub_pi16(t0, t3);
		t0 = _mm_add_pi16(t0, t3);

		t3 = _mm_sub_pi16(t1, t2);
		t1 = _mm_add_pi16(t1, t2);

		t2 = _mm_sub_pi16(t6, t5);
		t6 = _mm_add_pi16(t6, t5);

		/* Stage.4 */

		Y[2 * 0] = _mm_srai_pi16(_mm_add_pi16(_mm_add_pi16(t0, t7), C[0]), 4);
		Y[2 * 1] = _mm_srai_pi16(_mm_add_pi16(_mm_add_pi16(t1, t6), C[0]), 4);
		Y[2 * 2] = _mm_srai_pi16(_mm_add_pi16(_mm_add_pi16(t3, t2), C[0]), 4);
		Y[2 * 3] = _mm_srai_pi16(_mm_add_pi16(_mm_add_pi16(s0, t4), C[0]), 4);
		Y[2 * 4] = _mm_srai_pi16(_mm_add_pi16(_mm_sub_pi16(s0, t4), C[0]), 4);
		Y[2 * 5] = _mm_srai_pi16(_mm_add_pi16(_mm_sub_pi16(t3, t2), C[0]), 4);
		Y[2 * 6] = _mm_srai_pi16(_mm_add_pi16(_mm_sub_pi16(t1, t6), C[0]), 4);
		Y[2 * 7] = _mm_srai_pi16(_mm_add_pi16(_mm_sub_pi16(t0, t7), C[0]), 4);
	}
}

/* */

/*
 * Full dequantize + 2-D inverse DCT of one 8x8 block.
 *
 * block  : 64 quantized coefficients.
 * matrix : 64 dequantization factors in the same layout as block.
 * coeff  : receives the 64 reconstructed values; also used as scratch
 *          between the two passes.
 *
 * All 64 products are formed (low 16 bits of each multiply), then the first
 * IDCT pass, a transpose and the second IDCT pass are applied.
 */
static __inline void DequantizeIDCT8x8_MMX(
	const INT16* block,
	const INT16* matrix,
	INT16*       coeff)
{
	ALIGN(0x10) INT16 work[64];

	const __m64* q = (const __m64*)block;
	const __m64* m = (const __m64*)matrix;
	__m64*       w = (__m64*)work;

	INT32 i;

	/* Dequantize: 16 vectors of four coefficients each */
	for (i = 0; i < 16; i++) {
		w[i] = _mm_mullo_pi16(q[i], m[i]);
	}

	/* iDCT row pass */
	IDCT_R_8_MMX(work, coeff);

	/* Transpose */
	Transpose_MMX(coeff, work);

	/* iDCT column pass */
	IDCT_C_8_MMX(work, coeff);
}

/* */

/*
 * Dequantize + 2-D inverse DCT for a block whose non-zero coefficients all
 * lie in the first four lanes of matrix rows 0..3.
 *
 * Only those 16 coefficients are dequantized (vectors 0, 2, 4 and 6); the
 * rest of the scratch buffer is left unwritten and the reduced IDCT passes
 * do not consume it.
 */
static __inline void DequantizeIDCT8x8_16_MMX(
	const INT16* block,
	const INT16* matrix,
	INT16*       coeff)
{
	ALIGN(0x10) INT16 work[64];

	const __m64* q = (const __m64*)block;
	const __m64* m = (const __m64*)matrix;
	__m64*       w = (__m64*)work;

	INT32 i;

	/* Dequantize: first half of rows 0..3 only */
	for (i = 0; i < 8; i += 2) {
		w[i] = _mm_mullo_pi16(q[i], m[i]);
	}

	/* iDCT row pass */
	IDCT_R_8_4_MMX(work, coeff);

	/* Transpose */
	Transpose_MMX(coeff, work);

	/* iDCT column pass */
	IDCT_C_8_4_MMX(work, coeff);
}

/* */

/*
 * DC-only reconstruction: fills all 64 entries of coeff with
 * (dc * matrix[0] + 15) >> 5, truncated to 16 bits.
 */
static __inline void DequantizeIDCT8x8_0_MMX(
	INT16        dc,
	const INT16* matrix,
	INT16*       coeff)
{
	__m64*      out  = (__m64*)coeff;
	const __m64 fill = _mm_set1_pi16(((dc * matrix[0]) + 15) >> 5);

	INT32 i;

	/* 16 vectors x 4 lanes = 64 values */
	for (i = 0; i < 16; i++) {
		out[i] = fill;
	}
}

/* */

/*
 * Read cursor for one token stream, as consumed by DecodeCoefficients_MMX.
 */
struct DecodeCoefficientsLeaf {

	/* Pending end-of-block count; while positive, each decode call that
	   reaches this leaf decrements it and stops without reading a token. */
	INT32 EOB_Run;

	/* Parallel token cursors, advanced together. A negative *Run marks an
	   end-of-block token whose *Coeff is stored into EOB_Run; otherwise *Run
	   is the number of scan positions to skip and *Coeff the value stored. */
	INT8*  Run;
	INT16* Coeff;

}; /* DecodeCoefficientsLeaf */

typedef struct DecodeCoefficientsLeaf DecodeCoefficientsLeaf_t;

/*
 * One leaf per scan position (0..63). The decoder starts at Leaf[0] and,
 * after storing a coefficient, continues with the leaf of the next scan
 * position.
 */
struct DecodeCoefficientsContext {

	DecodeCoefficientsLeaf_t Leaf[64];

}; /* DecodeCoefficientsContext */

typedef struct DecodeCoefficientsContext DecodeCoefficientsContext_t;

/*
 * Scan-position -> block-index table used when scattering decoded
 * coefficients into a 64-entry block.
 *
 * The sequence is the conventional 8x8 zig-zag order with row and column
 * swapped (0, 8, 1, 2, 9, 16, ... instead of 0, 1, 8, 16, 9, 2, ...).
 * NOTE(review): presumably this pairs with the transposed dequantization
 * matrices built by QT_UpdateDequantizeMatrix_MMX - confirm.
 */
ALIGN(0x10) static const UINT8 IZZ[64] = {
	 0,  8,  1,  2,  9, 16, 24, 17,
	10,  3,  4, 11, 18, 25, 32, 40,
	33, 26, 19, 12,  5,  6, 13, 20,
	27, 34, 41, 48, 56, 49, 42, 35,
	28, 21, 14,  7, 15, 22, 29, 36,
	43, 50, 57, 58, 51, 44, 37, 30,
	23, 31, 38, 45, 52, 59, 60, 53,
	46, 39, 47, 54, 61, 62, 55, 63
};

/*
 * Decodes the coefficients of one block from the token streams in ctx.
 *
 * t     : unused here; kept for the existing call signature.
 * ctx   : per-scan-position token cursors; advanced as tokens are consumed.
 * block : receives the coefficients; its first 64 entries are zeroed first
 *         and each decoded value is stored at block[IZZ[scan position]].
 *
 * Decoding walks the scan positions. At each step the current leaf either
 * has a pending end-of-block run (consume one and stop), or supplies a
 * (run, coeff) token: a negative run starts an end-of-block run of `coeff`
 * blocks, otherwise `run` positions are skipped and `coeff` is stored.
 *
 * Returns the scan position following the last stored coefficient (0 when
 * nothing was stored), never more than 64.
 */
static INT32 DecodeCoefficients_MMX(
	FrameDecoder_t*              t,
	DecodeCoefficientsContext_t* ctx,
	INT16*                       block)
{
	/* IZZ is a UINT8 table; walk it with a matching pointer type. */
	const UINT8* bi = IZZ;
	const UINT8* ei = IZZ + 64;

	DecodeCoefficientsLeaf_t* leaf = ctx->Leaf;

	const __m64 z = _mm_setzero_si64();

	__m64* b = (__m64*)block;
	INT32  i;

	/* Clear the 64 coefficients (16 vectors of four). */
	for (i = 0; i < 16; i++) {
		b[i] = z;
	}

	while (bi < ei) {
		if (leaf->EOB_Run > 0) {
			leaf->EOB_Run -= 1;
			break;

		} else {
			INT32 run   = *((leaf->Run  )++);
			INT32 coeff = *((leaf->Coeff)++);

			if (run < 0) {
				leaf->EOB_Run = coeff;

			} else {
				/* A run that reaches or passes the end of the scan would
				   index outside IZZ and then outside block; treat it as
				   the end of the block instead of storing anything. */
				if (run >= ei - bi) {
					bi = ei;
					break;
				}

				bi += run;

				block[*(bi++)] = (INT16)coeff;

				leaf = ctx->Leaf + (bi - IZZ);
			}
		}
	}

	return (INT32)(bi - IZZ);
}

/* */

/*
 * Reconstructs one intra-coded 8x8 block of plane p at (x, y).
 *
 * dc == NOT_CODED : the block is copied unchanged from plane r.
 * otherwise       : coefficients are decoded from ctx, dequantized with the
 *                   intra matrix for (qi, plane), inverse transformed and
 *                   written with the +128 intra bias.
 *
 * The transform variant depends on how far the decoded coefficients reach:
 * more than 10 scan positions -> full transform, 2..10 -> reduced transform,
 * at most 1 -> DC-only fill.
 */
static void Reconstruct_IntraBlock(
	FrameDecoder_t*              t,
	Plane_t*                     p,
	INT32                        x,
	INT32                        y,
	INT16                        dc,
	INT32                        qi,
	INT32                        plane,
	Plane_t*                     r,
	DecodeCoefficientsContext_t* ctx)
{
	ALIGN(0x10) INT16 block[64 + 64];
	ALIGN(0x10) INT16 coeff[64];

	const INT16 (*mat)[64] = t->Reconstructor->Matrix[qi][0];

	INT32 count;

	if (dc == NOT_CODED) {
		Block_CopyPlane8x8_MMX(p, x, y, r);
		return;
	}

	count = DecodeCoefficients_MMX(t, ctx, block);

	if (count <= 1) {
		DequantizeIDCT8x8_0_MMX(dc, mat[plane], coeff);

	} else {
		/* The DC term is supplied by the caller, not by the token stream. */
		block[0] = dc;

		if (count <= 10) {
			DequantizeIDCT8x8_16_MMX(block, mat[plane], coeff);
		} else {
			DequantizeIDCT8x8_MMX(block, mat[plane], coeff);
		}
	}

	Block_CopyIntra8x8_MMX(p, x, y, coeff);
}

/* */

/*
 * Reconstructs one inter-coded 8x8 block of plane p at (x, y).
 *
 * dc == NOT_CODED : if r is non-NULL the block is copied from plane r;
 *                   with r == NULL nothing is done.
 * otherwise       : coefficients are decoded from ctx, dequantized with the
 *                   inter matrix for (qi, plane), inverse transformed and
 *                   added to the prediction already present in p.
 *
 * The transform variant depends on how far the decoded coefficients reach:
 * more than 10 scan positions -> full transform, 2..10 -> reduced transform,
 * at most 1 -> DC-only fill.
 */
static void Reconstruct_InterBlock(
	FrameDecoder_t*              t,
	Plane_t*                     p,
	INT32                        x,
	INT32                        y,
	INT16                        dc,
	INT32                        qi,
	INT32                        plane,
	Plane_t*                     r,
	DecodeCoefficientsContext_t* ctx)
{
	ALIGN(0x10) INT16 block[64 + 64];
	ALIGN(0x10) INT16 coeff[64];

	const INT16 (*mat)[64] = t->Reconstructor->Matrix[qi][1];

	INT32 count;

	if (dc == NOT_CODED) {
		if (r == NULL) {
			return;
		}
		Block_CopyPlane8x8_MMX(p, x, y, r);
		return;
	}

	count = DecodeCoefficients_MMX(t, ctx, block);

	if (count <= 1) {
		DequantizeIDCT8x8_0_MMX(dc, mat[plane], coeff);

	} else {
		/* The DC term is supplied by the caller, not by the token stream. */
		block[0] = dc;

		if (count <= 10) {
			DequantizeIDCT8x8_16_MMX(block, mat[plane], coeff);
		} else {
			DequantizeIDCT8x8_MMX(block, mat[plane], coeff);
		}
	}

	Block_ReviseInter8x8_MMX(p, x, y, coeff);
}

/* */

/*
 * Pixel offsets (x in S_PX, y in S_PY) of the sixteen 8x8 blocks inside a
 * 32x32 super block, listed in the order the reconstruction loops visit them.
 * Consecutive groups of four entries fall inside one 16x16 macro block.
 */
static const INT8 S_PX[16] = {
	0*8, 1*8, 1*8, 0*8,
	0*8, 0*8, 1*8, 1*8,
	2*8, 2*8, 3*8, 3*8,
	3*8, 2*8, 2*8, 3*8
};

static const INT8 S_PY[16] = {
	0*8, 0*8, 1*8, 1*8,
	2*8, 3*8, 3*8, 2*8,
	2*8, 3*8, 3*8, 2*8,
	1*8, 1*8, 0*8, 0*8
};

/*
 * Pixel offsets (x in M_PX, y in M_PY) of the four 16x16 macro blocks inside
 * a 32x32 super block, in visiting order: (0,0), (0,16), (16,16), (16,0).
 * Entry m covers the blocks S_PX/S_PY[4*m .. 4*m + 3].
 */
static const INT8 M_PX[4] = {
	0*16, 0*16,
	1*16, 1*16
};

static const INT8 M_PY[4] = {
	0*16, 1*16,
	1*16, 0*16
};

/* */

/*
 * Reconstructs plane 0 of the current frame (t->Frame[1]).
 *
 * The plane is walked super block by super block (32x32) and, inside each,
 * macro block by macro block (16x16) in M_PX/M_PY order; macro blocks that
 * start outside the MX x MY macro-block grid are skipped. For every visited
 * macro block the mode byte selects how the prediction is produced, then its
 * four 8x8 blocks are reconstructed in S_PX/S_PY order. The block-index,
 * mode, motion-vector and quantizer-index cursors advance once per visited
 * macro block.
 *
 * Frame[0] is used as the golden reference (modes 5 and 6) and Frame[2] as
 * the other reference. Mode values above 7 produce no output for the macro
 * block, but the cursors still advance.
 */
static void Reconstruct_YPlane_MMX(
	FrameDecoder_t* t)
{
	INT32 sbx, sby;

	INT32 sx = t->Index->SX[0] * 32;
	INT32 sy = t->Index->SY[0] * 32;

	INT32 mx = t->Index->MX * 16;
	INT32 my = t->Index->MY * 16;

	INT32 bx = t->Index->BX[0];

	const UINT16* bi = t->Index->BIndex[0];

	Plane_t* g = t->Frame[0];
	Plane_t* p = t->Frame[1];
	Plane_t* r = t->Frame[2];

	const UINT8*          mm = t->MBMode;
	const MotionVector_t* mv = t->MV;

	const UINT8* qi = t->BQI;

	ALIGN(0x10) DecodeCoefficientsContext_t ctx = { 0 };

	INT32 n;
	for (n = 0; n < 64; n++) {
		ctx.Leaf[n].Run   = t->BRun  [0][n];
		ctx.Leaf[n].Coeff = t->BCoeff[0][n];
	}

	for (sby = 0; sby < sy; sby += 32) {
		for (sbx = 0; sbx < sx; sbx += 32) {
			INT32 mb;

			for (mb = 0; mb < 4; mb++) {
				const INT32 first = 4 * mb; /* first S_PX/S_PY entry of this macro block */
				const INT32 x0    = sbx + M_PX[mb];
				const INT32 y0    = sby + M_PY[mb];

				/* Plane handed to Reconstruct_InterBlock for uncoded blocks. */
				Plane_t* fallback = r;

				INT32 mode, k;

				if (x0 >= mx || y0 >= my) {
					continue;
				}

				mode = *mm;

				if (mode == 1) { /* INTRA */
					for (k = 0; k < 4; k++) {
						Reconstruct_IntraBlock(t, p, sbx + S_PX[first + k], sby + S_PY[first + k], t->DC[bi[k]], qi[k], 0, r, &ctx);
					}

				} else if (mode <= 7) {
					/* Build the prediction for the whole macro block. */
					switch (mode) {
					case 0: /* INTER_NOMV */
						Block_CopyPlane16x16_MMX(p, x0, y0, r);
						/* Uncoded blocks are already in place. */
						fallback = NULL;
						break;

					case 2: /* INTER_MV */
					case 3: /* INTER_MV_LAST */
					case 4: /* INTER_MV_LAST2 */
						MotionComp_Block16x16_MMX(p, x0, y0, r, mv);
						break;

					case 5: /* INTER_GOLDEN_NOMV */
						Block_CopyPlane16x16_MMX(p, x0, y0, g);
						break;

					case 6: /* INTER_GOLDEN_MV */
						MotionComp_Block16x16_MMX(p, x0, y0, g, mv);
						break;

					case 7: /* INTER_MV_FOUR */
					{
						/* One vector per coded block, consumed in raster
						   order; uncoded blocks do not consume a vector. */
						const MotionVector_t* v = mv;

						const INT16* dc = t->DC + (x0 >> 3) + (y0 >> 3) * bx;

						for (k = 0; k < 4; k++) {
							const INT32 col = k & 1;
							const INT32 row = k >> 1;

							if (dc[col + row * bx] != NOT_CODED) {
								MotionComp_Block8x8Y_MMX(p, x0 + col * 8, y0 + row * 8, r, v++);
							}
						}
						break;
					}

					} /* switch */

					/* Add the residuals on top of the prediction. */
					for (k = 0; k < 4; k++) {
						Reconstruct_InterBlock(t, p, sbx + S_PX[first + k], sby + S_PY[first + k], t->DC[bi[k]], qi[k], 0, fallback, &ctx);
					}
				}

				bi += 4;
				mm += 1;
				mv += 4;
				qi += 4;
			}
		}
	}
}

/* */

/*
 * Reconstructs planes 1 and 2 of the current frame (t->Frame[1] + 1 / + 2).
 *
 * The planes are walked super block by super block (32x32), block by block
 * in S_PX/S_PY order; blocks that start outside the (MX * 8) x (MY * 8) area
 * are skipped. Both planes share one mode byte and one motion vector per
 * block position, while DC values, quantizer indices and token contexts are
 * kept per plane. For each visited block the prediction is written to both
 * planes first, then the residuals are applied to both. The two quantizer
 * cursors advance once per visited block.
 *
 * Frame[0] is used as the golden reference (modes 5 and 6) and Frame[2] as
 * the other reference. Mode values above 7 produce no output for the block,
 * but the cursors still advance.
 */
static void Reconstruct_CPlane_MMX(
	FrameDecoder_t* t)
{
	INT32 sbx, sby;

	INT32 sx = t->Index->SX[1] * 32;
	INT32 sy = t->Index->SY[1] * 32;

	INT32 mx = t->Index->MX * 8;
	INT32 my = t->Index->MY * 8;

	INT32 bx = t->Index->BX[1];

	Plane_t* g = t->Frame[0];
	Plane_t* p = t->Frame[1];
	Plane_t* r = t->Frame[2];

	const INT16* DC0 = t->DC + t->Index->BC[0];
	const INT16* DC1 = DC0   + t->Index->BC[1];

	const UINT8* m = t->BMode + t->Index->BC[0];

	const UINT8* qi0 = t->BQI + t->Index->BC[0];
	const UINT8* qi1 = qi0    + t->Index->BC[1];

	ALIGN(0x10) DecodeCoefficientsContext_t ctx[2] = { 0 };

	INT32 n;
	for (n = 0; n < 64; n++) {
		ctx[0].Leaf[n].Run   = t->BRun  [1][n];
		ctx[0].Leaf[n].Coeff = t->BCoeff[1][n];

		ctx[1].Leaf[n].Run   = t->BRun  [2][n];
		ctx[1].Leaf[n].Coeff = t->BCoeff[2][n];
	}

	for (sby = 0; sby < sy; sby += 32) {
		for (sbx = 0; sbx < sx; sbx += 32) {
			INT32 k;

			for (k = 0; k < 16; k++) {
				const INT32 xx = sbx + S_PX[k];
				const INT32 yy = sby + S_PY[k];

				INT32 idx, mode;

				if (xx >= mx || yy >= my) {
					continue;
				}

				idx  = (xx >> 3) + (yy >> 3) * bx;
				mode = m[idx];

				if (mode == 1) { /* INTRA */
					Reconstruct_IntraBlock(t, p + 1, xx, yy, DC0[idx], *qi0, 1, r + 1, ctx + 0);
					Reconstruct_IntraBlock(t, p + 2, xx, yy, DC1[idx], *qi1, 2, r + 2, ctx + 1);

				} else if (mode <= 7) {
					/* Modes 5 and 6 predict from the golden frame. */
					Plane_t* ref = (mode == 5 || mode == 6) ? g : r;

					if (mode == 0 || mode == 5) {
						/* INTER_NOMV / INTER_GOLDEN_NOMV: straight copy */
						Block_CopyPlane8x8_MMX(p + 1, xx, yy, ref + 1);
						Block_CopyPlane8x8_MMX(p + 2, xx, yy, ref + 2);
					} else {
						/* INTER_MV, INTER_MV_LAST, INTER_MV_LAST2,
						   INTER_GOLDEN_MV, INTER_MV_FOUR: motion compensated */
						MotionComp_Block8x8C_MMX(p + 1, xx, yy, ref + 1, t->MVC + idx);
						MotionComp_Block8x8C_MMX(p + 2, xx, yy, ref + 2, t->MVC + idx);
					}

					/* INTER_NOMV passes NULL: uncoded blocks are already in
					   place. Every other mode passes the Frame[2] plane. */
					Reconstruct_InterBlock(t, p + 1, xx, yy, DC0[idx], *qi0, 1, (mode != 0) ? r + 1 : NULL, ctx + 0);
					Reconstruct_InterBlock(t, p + 2, xx, yy, DC1[idx], *qi1, 2, (mode != 0) ? r + 2 : NULL, ctx + 1);
				}

				qi0++;
				qi1++;
			}
		}
	}
}

/* */

/*
 * Loop-filter limit constants, each replicated across four 16-bit lanes
 * (filled in by MFilter_Setup).
 */
struct MFilter {

	__m64 L;   /* +limit     */
	__m64 L2;  /* +limit * 2 */

	__m64 NL;  /* -limit     */
	__m64 NL2; /* -limit * 2 */

}; /* MFilter */

typedef struct MFilter MFilter_t;

/*
 * Fills *t with the four limit vectors derived from lim:
 * +lim, +2*lim, -lim and -2*lim, each in all four 16-bit lanes.
 */
static void MFilter_Setup(MFilter_t* t, INT32 lim)
{
	const __m64 pos = _mm_set1_pi16(lim);
	const __m64 neg = _mm_sub_pi16(_mm_setzero_si64(), pos);

	t->L  = pos;
	t->NL = neg;

	t->L2  = _mm_slli_pi16(pos, 1);
	t->NL2 = _mm_slli_pi16(neg, 1);
}

/* Four lanes of 4: added by the loop filters before their arithmetic shift
   right by 3 (rounding). */
ALIGN(0x10) static const UINT16 UR_4[4] = { 4, 4, 4, 4 };

/* */

/*
 * Scalar, table-driven horizontal edge filter. Compiled out (#if 0);
 * NOTE(review): looks like the reference that Filter_LoopFilterH_MMX was
 * derived from - confirm before relying on it.
 *
 * For each of 8 rows it computes x = (p[-2] - p[1]) + 3 * (p[0] - p[-1]),
 * looks up v = Delta[127 + ((x + 4) >> 3)], then writes p[-1] + v and
 * p[0] - v clamped to 0..255. The two-element arrays implement the clamps
 * without branches: index 1 holds the clamp value.
 */
#if 0
static void Filter_LoopFilterH(
	const LoopFilter_t* t,
	UINT8*              b,
	INT32               s)
{
	const INT16* d = t->Delta + 127;

	INT32 p0[2];
	INT32 p1[2];

	INT32 q0[2];
	INT32 q1[2];

	UINT8* p   = b;
	UINT8* end = p + s * 8;

	p0[1] = 0;
	p1[1] = 0;
	q0[1] = 255;
	q1[1] = 255;

	for (; p < end; p += s) {
		INT32 x = (p[-2] - p[1]) + 3 * (p[0] - p[-1]);
		INT32 v = d[(x + 4) >> 3];

		p0[0] = p[-1] + v;
		p1[0] = p[ 0] - v;

		q0[0] = p0[(p0[0] < 0)];
		q1[0] = p1[(p1[0] < 0)];

		p[-1] = q0[(q0[0] > 255)];
		p[ 0] = q1[(q1[0] > 255)];
	}
}
#endif

/*
 * Filters a vertical block edge: for 8 consecutive rows it adjusts the two
 * pixels on either side of the edge, b[-1] and b[0].
 *
 * t : limit vectors from MFilter_Setup.
 * b : first pixel to the right of the edge in the top row.
 * s : row stride in bytes; the loop bound b + 8 * s assumes s > 0.
 *
 * Rows are handled four at a time. Per row, with the pixels p[-2..1]:
 *   R = ((p[-2] - p[1]) + 3 * (p[0] - p[-1]) + 4) >> 3
 *   r = R            if -L   <= R <=  L
 *       2L - R       if  L   <  R <= 2L
 *      -2L - R       if -2L  <= R < -L
 *       0            otherwise
 *   p[-1] += r, p[0] -= r, both saturated to 0..255.
 */
static __inline void Filter_LoopFilterH_MMX(
	const MFilter_t* t,
	UINT8*           b,
	INT32            s)
{
	UINT8* p = (UINT8*)b;
	UINT8* e = p + s * 8;

	const __m64 z = _mm_setzero_si64();

	for (; p < e; p += 4 * s) {
		/* Load p[-2..1] of four rows and widen each byte to 16 bits. */
		__m64 S0 = _mm_unpacklo_pi8(_mm_cvtsi32_si64(*((UINT32*)(p - 2 + 0 * s))), z);
		__m64 S1 = _mm_unpacklo_pi8(_mm_cvtsi32_si64(*((UINT32*)(p - 2 + 1 * s))), z);
		__m64 S2 = _mm_unpacklo_pi8(_mm_cvtsi32_si64(*((UINT32*)(p - 2 + 2 * s))), z);
		__m64 S3 = _mm_unpacklo_pi8(_mm_cvtsi32_si64(*((UINT32*)(p - 2 + 3 * s))), z);

		/* 4x4 transpose so that each vector holds one pixel column for the
		   four rows. */
		__m64 u0 = _mm_unpacklo_pi16(S0, S1);
		__m64 u1 = _mm_unpackhi_pi16(S0, S1);
		__m64 u2 = _mm_unpacklo_pi16(S2, S3);
		__m64 u3 = _mm_unpackhi_pi16(S2, S3);

		/* P0 = column p[-1], P1 = column p[0] */
		__m64 P0 = _mm_unpackhi_pi32(u0, u2);
		__m64 P1 = _mm_unpacklo_pi32(u1, u3);

		/* X = p[-2] - p[1], Y = p[0] - p[-1], R = (X + 3Y + 4) >> 3 */
		__m64 X = _mm_sub_pi16(_mm_unpacklo_pi32(u0, u2), _mm_unpackhi_pi32(u1, u3));
		__m64 Y = _mm_sub_pi16(P1, P0);
		__m64 R = _mm_srai_pi16(_mm_add_pi16(_mm_add_pi16(_mm_add_pi16(X, Y), _mm_slli_pi16(Y, 1)), *((__m64*)UR_4)), 3);

		/* Lane masks: m1 where R > L, m2 where R < -L */
		__m64 m1 = _mm_cmpgt_pi16(R,     t->L);
		__m64 m2 = _mm_cmpgt_pi16(t->NL, R   );

		__m64 r, D;

		/* Fold values beyond +/-L back towards zero ... */
		r = _mm_or_si64(_mm_andnot_si64(m1, R), _mm_and_si64(_mm_sub_pi16(t->L2,  R), m1));
		r = _mm_or_si64(_mm_andnot_si64(m2, r), _mm_and_si64(_mm_sub_pi16(t->NL2, R), m2));

		/* ... and zero the lanes where R lies beyond +/-2L. */
		r = _mm_andnot_si64(_mm_cmpgt_pi16(R,      t->L2), r);
		r = _mm_andnot_si64(_mm_cmpgt_pi16(t->NL2, R    ), r);

		P0 = _mm_add_pi16(P0, r);
		P1 = _mm_sub_pi16(P1, r);

		/* Saturate to bytes and interleave as (p[-1], p[0]) pairs, one
		   pair per row. */
		D = _mm_unpacklo_pi8(_mm_packs_pu16(P0, P0), _mm_packs_pu16(P1, P1));

		{
			/* d0 holds the pairs of rows 0 and 1, d1 those of rows 2 and 3. */
			UINT32 d0 = _mm_cvtsi64_si32(D);
			UINT32 d1 = _mm_cvtsi64_si32(_mm_unpackhi_pi32(D, D));

			*((UINT16*)(p - 1 + 0 * s)) = (UINT16)d0;
			*((UINT16*)(p - 1 + 1 * s)) =         d0 >> 16;
			*((UINT16*)(p - 1 + 2 * s)) = (UINT16)d1;
			*((UINT16*)(p - 1 + 3 * s)) =         d1 >> 16;
		}
	}
}

/* */

/*
 * Scalar, table-driven vertical edge filter. Compiled out (#if 0);
 * NOTE(review): looks like the reference that Filter_LoopFilterV_MMX was
 * derived from - confirm before relying on it.
 *
 * For each of 8 columns it computes
 * x = (p[-2 * s] - p[s]) + 3 * (p[0] - p[-s]), looks up
 * v = Delta[127 + ((x + 4) >> 3)], then writes p[-s] + v and p[0] - v
 * clamped to 0..255. The two-element arrays implement the clamps without
 * branches: index 1 holds the clamp value.
 */
#if 0
static void Filter_LoopFilterV(
	const LoopFilter_t* t,
	UINT8*              b,
	INT32               s)
{
	const INT16* d = t->Delta + 127;

	INT32 p0[2];
	INT32 p1[2];

	INT32 q0[2];
	INT32 q1[2];

	UINT8* p   = b;
	UINT8* end = p + 8;

	p0[1] = 0;
	p1[1] = 0;
	q0[1] = 255;
	q1[1] = 255;

	for (; p < end; p++) {
		INT32 x = (p[-2 * s] - p[1 * s]) + 3 * (p[0] - p[-1 * s]);
		INT32 v = d[(x + 4) >> 3];

		p0[0] = p[-s] + v;
		p1[0] = p[ 0] - v;

		q0[0] = p0[(p0[0] < 0)];
		q1[0] = p1[(p1[0] < 0)];

		p[-s] = q0[(q0[0] > 255)];
		p[ 0] = q1[(q1[0] > 255)];
	}
}
#endif

/*
 * Filters a horizontal block edge: for 8 consecutive columns it adjusts the
 * pixels directly above and below the edge, b[-s] and b[0].
 *
 * t : limit vectors from MFilter_Setup.
 * b : first pixel of the row just below the edge.
 * s : row stride in bytes.
 *
 * Columns are handled four at a time. Per column:
 *   R = ((p[-2s] - p[s]) + 3 * (p[0] - p[-s]) + 4) >> 3
 *   r = R            if -L   <= R <=  L
 *       2L - R       if  L   <  R <= 2L
 *      -2L - R       if -2L  <= R < -L
 *       0            otherwise
 *   p[-s] += r, p[0] -= r, both saturated to 0..255.
 */
static __inline void Filter_LoopFilterV_MMX(
	const MFilter_t* t,
	UINT8*           b,
	INT32            s)
{
	UINT8* p = (UINT8*)b;
	UINT8* e = p + 8;

	const __m64 z = _mm_setzero_si64();

	for (; p < e; p += 4) {
		/* P0 = row above the edge, P1 = row below, widened to 16 bits */
		__m64 P0 = _mm_unpacklo_pi8(_mm_cvtsi32_si64(*((UINT32*)(p - s))), z);
		__m64 P1 = _mm_unpacklo_pi8(_mm_cvtsi32_si64(*((UINT32*)(p + 0))), z);

		/* X = p[-2s] - p[s], Y = p[0] - p[-s], R = (X + 3Y + 4) >> 3 */
		__m64 X = _mm_sub_pi16(
			_mm_unpacklo_pi8(_mm_cvtsi32_si64(*((UINT32*)(p - 2 * s))), z),
			_mm_unpacklo_pi8(_mm_cvtsi32_si64(*((UINT32*)(p +     s))), z));
		__m64 Y = _mm_sub_pi16(P1, P0);
		__m64 R = _mm_srai_pi16(_mm_add_pi16(_mm_add_pi16(_mm_add_pi16(X, Y), _mm_slli_pi16(Y, 1)), *((__m64*)UR_4)), 3);

		/* Lane masks: m1 where R > L, m2 where R < -L */
		__m64 m1 = _mm_cmpgt_pi16(R,     t->L);
		__m64 m2 = _mm_cmpgt_pi16(t->NL, R   );

		__m64 r;

		/* Fold values beyond +/-L back towards zero ... */
		r = _mm_or_si64(_mm_andnot_si64(m1, R), _mm_and_si64(_mm_sub_pi16(t->L2,  R), m1));
		r = _mm_or_si64(_mm_andnot_si64(m2, r), _mm_and_si64(_mm_sub_pi16(t->NL2, R), m2));

		/* ... and zero the lanes where R lies beyond +/-2L. */
		r = _mm_andnot_si64(_mm_cmpgt_pi16(R,      t->L2), r);
		r = _mm_andnot_si64(_mm_cmpgt_pi16(t->NL2, R    ), r);

		P0 = _mm_add_pi16(P0, r);
		P1 = _mm_sub_pi16(P1, r);

		/* Saturate back to bytes and store four pixels per row. */
		*((UINT32*)(p - s)) = _mm_cvtsi64_si32(_mm_packs_pu16(P0, P0));
		*((UINT32*)(p + 0)) = _mm_cvtsi64_si32(_mm_packs_pu16(P1, P1));
	}
}

/* */

/*
 * Applies the loop filter to all three planes of the current frame
 * (t->Frame[1] onwards).
 *
 * Blocks are visited in raster order, plane after plane, with one t->DC
 * entry per block. For every coded block (entry != NOT_CODED):
 *   - its left edge is filtered unless it is in the first column,
 *   - its top edge is filtered unless it is in the first row,
 *   - its right edge is filtered if the right neighbour exists and is not
 *     coded,
 *   - its bottom edge is filtered if the neighbour below exists and is not
 *     coded.
 */
static void FrameLoopFilter_MMX(
	FrameDecoder_t* t)
{
	const INT16* coded = t->DC;

	Plane_t* pl = t->Frame[1];

	ALIGN(0x10) MFilter_t mf;

	INT32 idx;
	INT32 col, row;

	MFilter_Setup(&mf, t->Filter.Limit);

	for (idx = 0; idx < 3; idx++, pl++) {
		const INT32 cols = t->Index->BX[idx];
		const INT32 rows = t->Index->BY[idx];

		UINT8* line = pl->Plane;

		for (row = 0; row < rows; row++, line += pl->Pitch * 8) {
			UINT8* px = line;

			for (col = 0; col < cols; col++, px += 8, coded++) {
				if (*coded == NOT_CODED) {
					continue;
				}

				/* Left edge */
				if (col > 0) {
					Filter_LoopFilterH_MMX(&mf, px, pl->Pitch);
				}

				/* Top edge */
				if (row > 0) {
					Filter_LoopFilterV_MMX(&mf, px, pl->Pitch);
				}

				/* Right edge, only when the neighbour will not filter it */
				if (col < cols - 1 && coded[1] == NOT_CODED) {
					Filter_LoopFilterH_MMX(&mf, px + 8, pl->Pitch);
				}

				/* Bottom edge, only when the neighbour will not filter it */
				if (row < rows - 1 && coded[cols] == NOT_CODED) {
					Filter_LoopFilterV_MMX(&mf, px + 8 * pl->Pitch, pl->Pitch);
				}
			}
		}
	}
}

/* */

/*
 * Reconstructs a complete frame: plane 0 first, then planes 1 and 2, and
 * finally the loop filter when the filter limit is positive.
 */
void QT_ReconstructFrame_MMX(
	FrameDecoder_t* t)
{
	Reconstruct_YPlane_MMX(t);
	Reconstruct_CPlane_MMX(t);

	/* A limit of zero or less disables the loop filter. */
	if (t->Filter.Limit <= 0) {
		return;
	}

	FrameLoopFilter_MMX(t);
}

/* */

