/* TransformDecoder.c */
/* 2008/11/07         */

#include "StdAfx.h"

#include "TransformDecoder.h"

/* */

/* Memory pool used for every allocation in this file; set by TransformDecoder_Init. */
static MemoryPool_t* s_pool;

/* Cache slot index and the block size it serves (size = 64 << index): */
/*  0,   1,   2,   3,    4,    5,    6,    7 */
/* 64, 128, 256, 512, 1024, 2048, 4096, 8192 */

/* Lazily created decoders, one slot per supported block size (NULL until first requested). */
static TransformDecoder_t* s_TransformDecoder[8];
static WindowingDecoder_t* s_WindowingDecoder[8];

/* */

/*
 * Records the memory pool that later decoder allocations in this file
 * are taken from. Nothing is allocated here; decoders are created on
 * first request by GetTransformDecoder / GetWindowingDecoder.
 */
void TransformDecoder_Init(MemoryPool_t* pool)
{
	s_pool = pool;
}

void TransformDecoder_Uninit(void)
{
}

/* */

/* File-local factories, defined further down; used by the Get* accessors below. */
/* Both take the block size n and return NULL when an allocation fails.          */
static TransformDecoder_t* SetupTransformDecoder(INT32 n);
static WindowingDecoder_t* CreateWindowingDecoder(INT32 n);

/* */

/*
 * Returns the shared transform decoder for block size n.
 *
 * n must be one of 64, 128, ..., 8192 (a power of two); any other value
 * yields NULL. The decoder is built on first request and cached in
 * s_TransformDecoder. If construction fails the slot stays NULL, so the
 * next request tries again.
 */
TransformDecoder_t* GetTransformDecoder(INT32 n)
{
	INT32 slot;
	INT32 size = 64; /* block size served by slot 0; doubles with each slot */

	for (slot = 0; slot < 8; slot++, size <<= 1) {
		if (size != n) {
			continue;
		}

		if (s_TransformDecoder[slot] == NULL) {
			s_TransformDecoder[slot] = SetupTransformDecoder(n);
		}

		return s_TransformDecoder[slot];
	}

	/* n is not a supported block size */
	return NULL;
}

/* */

/*
 * Returns the shared windowing decoder for block size n.
 *
 * n must be one of 64, 128, ..., 8192 (a power of two); any other value
 * yields NULL. The decoder is built on first request and cached in
 * s_WindowingDecoder. If construction fails the slot stays NULL, so the
 * next request tries again.
 */
WindowingDecoder_t* GetWindowingDecoder(INT32 n)
{
	INT32 slot;
	INT32 size = 64; /* block size served by slot 0; doubles with each slot */

	for (slot = 0; slot < 8; slot++, size <<= 1) {
		if (size != n) {
			continue;
		}

		if (s_WindowingDecoder[slot] == NULL) {
			s_WindowingDecoder[slot] = CreateWindowingDecoder(n);
		}

		return s_WindowingDecoder[slot];
	}

	/* n is not a supported block size */
	return NULL;
}

/* */

/* Reference (table-driven) implementation behind a TransformDecoder_t. */
struct TransformDecoderImplRef {
	/* Public interface. Must remain the first member: the code casts */
	/* between TransformDecoder_t* and this struct.                   */
	TransformDecoder_t d;
	/* Block size this instance was set up for. */
	INT32  n;
	/* Cosine table with n * 4 entries: m[i] = cos(pi / 2 * i / n). */
	FLOAT* m;
};

/*
 * Reference inverse MDCT, evaluated directly from its defining sum:
 *
 *   z[i] = sum over j in [0, n/2) of
 *          x[j] * cos(pi / (2n) * (2i + 1 + n/2) * (2j + 1)),   i in [0, n)
 *
 * d0 : decoder created by SetupTransformDecoder (really a
 *      struct TransformDecoderImplRef).
 * x  : n/2 input coefficients.
 * z  : receives n output samples.
 *
 * The cosine is looked up in the precomputed table, which covers one
 * full period (4n entries). The phase index is reduced with a bit mask,
 * which is only a valid modulo because n is a power of two.
 */
static void TransformImplRef(
	TransformDecoder_t* d0,
	const FLOAT*        x,
	FLOAT*              z)
{
	const struct TransformDecoderImplRef* impl = (const struct TransformDecoderImplRef*)d0;

	const INT32  size     = impl->n;
	const INT32  half     = size / 2;
	const INT32  wrap     = size * 4 - 1;
	const FLOAT* cosTable = impl->m;

	INT32 row;

	for (row = 0; row < size; row++) {
		/* the part of the phase that depends only on the output index */
		const INT32 rowPhase = 2 * row + 1 + half;

		FLOAT acc = 0;
		INT32 k;

		for (k = 0; k < half; k++) {
			acc += x[k] * cosTable[(rowPhase * (2 * k + 1)) & wrap];
		}

		z[row] = acc;
	}
}

/*
 * Initializes a reference transform decoder for block size n.
 *
 * Installs TransformImplRef as the Transform callback, stores n, and
 * fills a pool-allocated table of n * 4 cosines,
 * m[i] = cos(pi / 2 * i / n), i.e. one full period.
 *
 * Returns TRUE on success, FALSE if the table could not be allocated
 * (d->m is NULL in that case).
 */
static BOOL TransformSetupRef(
	struct TransformDecoderImplRef* d,
	INT32                           n)
{
	FLOAT* table;
	INT32  entries;
	INT32  k;

	d->d.Transform = TransformImplRef;
	d->n           = n;

	table = (FLOAT*)MemoryPool_Allocate(s_pool, sizeof(FLOAT) * n * 4);
	d->m  = table;
	if (table == NULL) {
		return FALSE;
	}

	entries = n * 4;
	for (k = 0; k < entries; k++) {
		table[k] = (FLOAT)cos(M_PI / 2 * k / n);
	}

	return TRUE;
}

/*
 * Allocates and initializes a reference transform decoder for block
 * size n from s_pool. Has internal linkage through the earlier static
 * prototype.
 *
 * Returns the decoder viewed through its TransformDecoder_t interface,
 * or NULL if either the instance or its cosine table cannot be
 * allocated. Nothing is handed back to the pool on failure.
 */
TransformDecoder_t* SetupTransformDecoder(INT32 n)
{
	struct TransformDecoderImplRef* impl =
		(struct TransformDecoderImplRef*)MemoryPool_Allocate(s_pool, sizeof(struct TransformDecoderImplRef));

	if (impl != NULL && TransformSetupRef(impl, n)) {
		return (TransformDecoder_t*)impl;
	}

	return NULL;
}

/* */

/* Implementation state behind a WindowingDecoder_t. */
struct WindowingDecoderImpl {
	/* Public interface. Must remain the first member: the code casts */
	/* between WindowingDecoder_t* and this struct.                   */
	WindowingDecoder_t d;
	/* Block size this instance was set up for. */
	INT32  n;
	/* Window table with n / 2 entries, filled by WindowingSetup. */
	FLOAT* w;
};

/*
 * Scalar overlap windowing of two half blocks:
 *
 *   d[i] = prv[i] * w[h - 1 - i] + cur[i] * w[i],   i in [0, h), h = n / 2
 *
 * The previous block is weighted by the window read backwards and the
 * current block by the window read forwards.
 *
 * wd  : decoder created by CreateWindowingDecoder.
 * d   : receives h output samples.
 * prv : h samples taken from the previous block.
 * cur : h samples taken from the current block.
 *
 * Samples are handled in groups of four, so h is expected to be a
 * multiple of 4 (true for every block size the cache serves).
 */
static void WindowingImpl(
	WindowingDecoder_t* wd,
	FLOAT*              d,
	const FLOAT*        prv,
	const FLOAT*        cur)
{
	const struct WindowingDecoderImpl* impl = (const struct WindowingDecoderImpl*)wd;

	const INT32  half = impl->n / 2;
	const FLOAT* win  = impl->w;

	INT32 base;
	INT32 lane;

	for (base = 0; base < half; base += 4) {
		for (lane = 0; lane < 4; lane++) {
			const INT32 i = base + lane;

			d[i] = prv[i] * win[half - 1 - i] + cur[i] * win[i];
		}
	}
}

/*
 * SIMD variant of WindowingImpl: four samples per iteration,
 *
 *   d[i] = prv[i] * w[h - 1 - i] + cur[i] * w[i],   i in [0, h), h = n / 2
 *
 * Every access uses aligned loads/stores (_mm_load_ps / _mm_store_ps),
 * so d, prv, cur and the window table must be 16-byte aligned, and h
 * must be a multiple of 4.
 */
static void WindowingImpl_SSE2(
	WindowingDecoder_t* wd,
	FLOAT*              d,
	const FLOAT*        prv,
	const FLOAT*        cur)
{
	const struct WindowingDecoderImpl* impl = (const struct WindowingDecoderImpl*)wd;

	const INT32 half = impl->n / 2;

	FLOAT* win = impl->w;

	INT32 i;

	for (i = 0; i < half; i += 4) {
		__m128 prevSamples = _mm_load_ps(prv + i);
		__m128 tailWindow  = _mm_load_ps(win + half - 4 - i);
		__m128 curSamples  = _mm_load_ps(cur + i);
		__m128 headWindow  = _mm_load_ps(win + i);
		__m128 mixed;

		/* reverse the four lanes so the window tail runs backwards */
		tailWindow = _mm_shuffle_ps(tailWindow, tailWindow, _MM_SHUFFLE(0, 1, 2, 3));

		mixed = _mm_add_ps(
			_mm_mul_ps(prevSamples, tailWindow),
			_mm_mul_ps(curSamples, headWindow));

		_mm_store_ps(d + i, mixed);
	}
}

/*
 * Initializes a windowing decoder for block size n.
 *
 * Chooses the SSE2 or scalar Windowing callback from the global
 * g_Enable_SSE2 flag, stores n, and fills a pool-allocated table of
 * h = n / 2 window values:
 *
 *   w[i] = sin(pi/2 * sin^2((i + 0.5) / h * pi/2)),   i in [0, h)
 *
 * This matches the Vorbis window formula.
 *
 * Returns TRUE on success, FALSE if the table could not be allocated
 * (d->w is NULL in that case).
 */
static BOOL WindowingSetup(
	struct WindowingDecoderImpl* d,
	INT32                        n)
{
	extern BOOL g_Enable_SSE2;

	const INT32 half = n / 2;

	FLOAT* window;
	INT32  k;

	d->d.Windowing = g_Enable_SSE2 ? WindowingImpl_SSE2 : WindowingImpl;
	d->n           = n;

	window = (FLOAT*)MemoryPool_Allocate(s_pool, sizeof(FLOAT) * half);
	d->w   = window;
	if (window == NULL) {
		return FALSE;
	}

	for (k = 0; k < half; k++) {
		const DOUBLE inner = sin((k + 0.5) / half * 0.5 * M_PI);

		window[k] = (FLOAT)sin(0.5 * M_PI * inner * inner);
	}

	return TRUE;
}

/*
 * Allocates and initializes a windowing decoder for block size n from
 * s_pool. Has internal linkage through the earlier static prototype.
 *
 * Returns the decoder viewed through its WindowingDecoder_t interface,
 * or NULL if either the instance or its window table cannot be
 * allocated. Nothing is handed back to the pool on failure.
 */
WindowingDecoder_t* CreateWindowingDecoder(INT32 n)
{
	struct WindowingDecoderImpl* impl =
		(struct WindowingDecoderImpl*)MemoryPool_Allocate(s_pool, sizeof(struct WindowingDecoderImpl));

	if (impl != NULL && WindowingSetup(impl, n)) {
		return (WindowingDecoder_t*)impl;
	}

	return NULL;
}

/* */

#include "VorbisDecoder.h"

#include "FrameConverter.h"

INT32 ConvertFrame_LE16(
	INT32              channels,
	const QV_Output_t* output,
	INT16*             samples)
{
	INT16* p = samples;

	switch (channels) {
	case 1:
	{
		const FLOAT* s0 = output->Output[0];
		const FLOAT* e0 = s0 + output->Length;

		while (s0 < e0) {
			INT32 x0 = (INT32)(*s0 * 0x8000);

			if (x0 < -0x8000) {
				x0 = -0x8000;
			} else if (x0 > 0x7fff) {
				x0 = 0x7fff;
			}

			p[0] = (INT16)x0;

			p  += 1;
			s0 += 1;
		}

		break;
	}

	case 2:
	{
		const FLOAT* s0 = output->Output[0];
		const FLOAT* s1 = output->Output[1];
		const FLOAT* e0 = s0 + output->Length;

		while (s0 < e0) {
			INT32 x0, x1;

			x0 = (INT32)(*s0 * 0x8000);
			x1 = (INT32)(*s1 * 0x8000);

			if (x0 < -0x8000) {
				x0 = -0x8000;
			} else if (x0 > 0x7fff) {
				x0 = 0x7fff;
			}

			if (x1 < -0x8000) {
				x1 = -0x8000;
			} else if (x1 > 0x7fff) {
				x1 = 0x7fff;
			}

			p[0] = (INT16)x0;
			p[1] = (INT16)x1;

			p += 2;

			s0 += 1;
			s1 += 1;
		}

		break;
	}

	default:
		return -1;
	}

	return (INT32)(p - samples);
}

/* */

/*
 * SSE2 variant of ConvertFrame_LE16: converts one decoded frame of float
 * samples to interleaved 16-bit PCM, four samples per channel at a time.
 *
 * channels : 1 (mono) or 2 (stereo); anything else is rejected.
 * output   : per-channel float buffers (output->Output[c]) holding
 *            output->Length samples each. The buffers are read with
 *            aligned loads, so they must be 16-byte aligned, and whole
 *            groups of four are processed, so Length is expected to be
 *            a multiple of 4.
 * samples  : destination. The stereo path uses an aligned 128-bit store
 *            and therefore needs a 16-byte aligned pointer; the mono
 *            path stores 64 bits at a time without that requirement.
 *
 * Each sample is scaled by 0x8000, clamped to [-32768, 32767] and then
 * converted with _mm_cvtps_epi32, which rounds according to the current
 * MXCSR rounding mode (the scalar converter truncates instead).
 *
 * The clamp is applied to the floats before the conversion:
 * _mm_cvtps_epi32 returns 0x80000000 for any lane that does not fit in
 * 32 bits, which the saturating pack would otherwise turn into -32768
 * even for very large positive samples. In-range results are unchanged.
 *
 * Returns the number of INT16 values written, or -1 for an unsupported
 * channel count.
 */
INT32 ConvertFrame_LE16_SSE2(
	INT32              channels,
	const QV_Output_t* output,
	INT16*             samples)
{
	INT16* p = samples;

	const __m128 M  = _mm_set_ps1((FLOAT)0x8000);
	const __m128 lo = _mm_set_ps1(-32768.0f);
	const __m128 hi = _mm_set_ps1(32767.0f);

	switch (channels) {
	case 1:
	{
		const FLOAT* s0 = output->Output[0];
		const FLOAT* e0 = s0 + output->Length;

		while (s0 < e0) {
			__m128 s00 = _mm_mul_ps(_mm_load_ps(s0), M);

			__m128i p00;

			/* saturate while still in float, see header comment */
			s00 = _mm_max_ps(_mm_min_ps(s00, hi), lo);

			p00 = _mm_cvtps_epi32(s00);

			/* pack 4 x 32-bit to 16-bit and store the low 4 values */
			_mm_storel_epi64((__m128i*)p, _mm_packs_epi32(p00, p00));

			p  += 4;
			s0 += 4;
		}

		break;
	}

	case 2:
	{
		const FLOAT* s0 = output->Output[0];
		const FLOAT* s1 = output->Output[1];
		const FLOAT* e0 = s0 + output->Length;

		while (s0 < e0) {
			__m128 s00 = _mm_mul_ps(_mm_load_ps(s0), M);
			__m128 s10 = _mm_mul_ps(_mm_load_ps(s1), M);

			__m128i p00;
			__m128i p10;

			/* saturate while still in float, see header comment */
			s00 = _mm_max_ps(_mm_min_ps(s00, hi), lo);
			s10 = _mm_max_ps(_mm_min_ps(s10, hi), lo);

			p00 = _mm_cvtps_epi32(s00);
			p10 = _mm_cvtps_epi32(s10);

			p00 = _mm_packs_epi32(p00, p00);
			p10 = _mm_packs_epi32(p10, p10);

			/* interleave: L0 R0 L1 R1 L2 R2 L3 R3 */
			_mm_store_si128((__m128i*)p, _mm_unpacklo_epi16(p00, p10));

			p += 8;

			s0 += 4;
			s1 += 4;
		}

		break;
	}

	default:
		return -1;
	}

	return (INT32)(p - samples);
}

/* */

