/* SFMTcc.c */

#include <windows.h>
#include <malloc.h>
#include <emmintrin.h>

#include "SFMTcc.h"

/* Compiler-specific shorthands (MSVC spellings). */
/* ALIGN(X): storage attribute requesting X-byte alignment for a variable. */
#define ALIGN(X) _declspec(align(X))
/* INLINE: inline hint applied to the small static helpers in this file. */
#define INLINE   __inline

/* */

/*
 * Allocates cb bytes whose start address is aligned to 16 (0x10) bytes.
 * Returns whatever _aligned_malloc returns (NULL is checked by the caller).
 * Memory obtained here must be released with mt_free().
 */
static VOID* mt_malloc(SIZE_T cb)
{
	return _aligned_malloc(cb, 0x10);
}

/*
 * Releases a block obtained from mt_malloc(). The pointer is forwarded to
 * _aligned_free(), the counterpart of the _aligned_malloc() used there.
 */
static VOID mt_free(VOID* p)
{
	_aligned_free(p);
}

/* */

/*
"SFMT-19937:122-18-1-11-1:dfffffef-ddfecb7f-bffaffff-bffffff6"
*/

/* Exponent from the parameter string above; determines the state size N. */
#define MEXP	19937

/* Offset, in 128-bit words, of the second state word read by the recursion. */
#define POS1	122

/* SL1 / SR1: shift counts in bits, applied to each 32-bit lane. */
/* SL2 / SR2: shift counts in bytes, applied to the whole 128-bit word. */
#define SL1		18
#define SL2		1
#define SR1		11
#define SR2		1

/* Per-lane AND masks applied after the right shift by SR1. */
#define MSK1	0xdfffffefU
#define MSK2	0xddfecb7fU
#define MSK3	0xbffaffffU
#define MSK4	0xbffffff6U

/* Parity words tested against the first four state words by mt_period_certification(). */
#define PARITY1	0x00000001U
#define PARITY2	0x00000000U
#define PARITY3	0x00000000U
#define PARITY4	0x13c9e684U

/* State size expressed in 128-bit words (N), 32-bit words (N32) and 64-bit words (N64). */
#define N	(MEXP / 128 + 1)
#define N32	(N * 4)
#define N64	(N * 2)

/* */

/*
 * Fills the N32-word state v from a single 32-bit seed.
 *
 * v[0] receives the seed; every following word is derived from its
 * predecessor with a multiplicative recurrence plus the word's own index.
 *
 * v    : state array with room for N32 UINT32 values (fully overwritten).
 * seed : initial value stored in v[0].
 */
static void mt_initialize(UINT32* v, UINT32 seed)
{
	INT32 i;

	v[0] = seed;

	for (i = 1; i < N32; i++) {
		const UINT32 prev = v[i - 1];

		v[i] = 1812433253U * (prev ^ (prev >> 30)) + (UINT32)i;
	}
}

/*
 * First mixing function used by mt_initialize_array():
 * folds the top bits of x into the low bits, then multiplies (mod 2^32).
 */
static INLINE UINT32 mt_func1(UINT32 x)
{
	const UINT32 folded = x ^ (x >> 27);

	return folded * 1664525U;
}

/*
 * Second mixing function used by mt_initialize_array():
 * same fold as mt_func1() but with a different multiplier (mod 2^32).
 */
static INLINE UINT32 mt_func2(UINT32 x)
{
	const UINT32 folded = x ^ (x >> 27);

	return folded * 1566083941U;
}

/*
 * Fills the N32-word state v from an array of 32-bit key words.
 *
 * v          : state array with room for N32 UINT32 values (fully overwritten).
 * init_key   : key words; elements 0 .. key_length-1 are read.
 * key_length : number of key words. A value <= 0 reads nothing from init_key.
 *
 * The state is first filled with the byte 0x8b, then stirred in three passes:
 * one pass that absorbs the key words, one pass that pads up to N32 - 1 rounds
 * when the key is shorter than that, and a final pass of N32 rounds using
 * mt_func2().
 */
static void mt_initialize_array(UINT32* v, const UINT32* init_key, INT32 key_length)
{
	const INT32 lag = 11;
	const INT32 mid = (N32 - lag) / 2;

	INT32 i, j;
	INT32 count;

	UINT32 r;

	memset(v, 0x8b, sizeof(UINT32) * N32);

	/*
	 * Number of rounds for the first two passes: max(N32, key_length + 1) - 1.
	 * Written as max(N32 - 1, key_length) so that key_length + 1 is never
	 * evaluated; that addition overflows (undefined behaviour) when
	 * key_length is the largest INT32 value.
	 */
	count = (key_length > N32 - 1) ? key_length : (N32 - 1);

	/* Round 0: seeds the stirring with the key length. */
	r = mt_func1(v[0] ^ v[mid] ^ v[N32 - 1]);
	v[mid] += r;
	r += key_length;
	v[mid + lag] += r;
	v[0]  = r;

	/* Pass 1: absorb each key word; i walks the state cyclically starting at 1. */
	for (i = 1, j = 0; j < count && j < key_length; j++, i = (i + 1) % N32) {
		r = mt_func1(v[i] ^ v[(i + mid) % N32] ^ v[(i + N32 - 1) % N32]);
		v[(i + mid) % N32] += r;
		r += init_key[j] + i;
		v[(i + mid + lag) % N32] += r;
		v[i] = r;
	}

	/* Pass 2: same stirring without key material until count rounds are done. */
	for (; j < count; j++, i = (i + 1) % N32) {
		r = mt_func1(v[i] ^ v[(i + mid) % N32] ^ v[(i + N32 - 1) % N32]);
		v[(i + mid) % N32] += r;
		r += i;
		v[(i + mid + lag) % N32] += r;
		v[i] = r;
	}

	/* Pass 3: N32 further rounds with mt_func2 and XOR instead of addition. */
	for (j = 0; j < N32; j++, i = (i + 1) % N32) {
		r = mt_func2(v[i] + v[(i + mid) % N32] + v[(i + N32 - 1) % N32]);
		v[(i + mid) % N32] ^= r;
		r -= i;
		v[(i + mid + lag) % N32] ^= r;
		v[i] = r;
	}
}

/*
 * Checks the parity of the first four state words against PARITY1..PARITY4
 * and, when the check fails, flips one state bit so that it passes.
 *
 * v : state array; only v[0] .. v[3] are read, and at most one bit of them
 *     is modified.
 */
static void mt_period_certification(UINT32* v)
{
	static const UINT32 parity[4] = { PARITY1, PARITY2, PARITY3, PARITY4 };

	UINT32 inner = 0;
	UINT32 bit;
	INT32 word;
	INT32 shift;

	/* Accumulate the masked state words. */
	for (word = 0; word < 4; word++) {
		inner ^= (v[word] & parity[word]);
	}

	/* Fold all 32 bits down into bit 0 (shifts 16, 8, 4, 2, 1). */
	for (shift = 16; shift > 0; shift >>= 1) {
		inner ^= (inner >> shift);
	}

	/* Odd parity: nothing to repair. */
	if ((inner & 1) == 1) {
		return;
	}

	/* Even parity: flip the lowest bit that is set in the parity vector. */
	for (word = 0; word < 4; word++) {
		for (bit = 1; bit != 0; bit <<= 1) {
			if ((parity[word] & bit) != 0) {
				v[word] ^= bit;
				return;
			}
		}
	}
}

/* */

/*
 * One step of the recursion in portable C, updating the 128-bit word at p
 * in place. Each 128-bit word is four consecutive UINT32 values, least
 * significant first.
 *
 * p : word to update (read, then overwritten).
 * q : second state word; each lane is shifted right by SR1 bits and masked.
 * c : word that is shifted right by SR2 bytes as a 128-bit quantity.
 * d : word whose lanes are shifted left by SL1 bits.
 *
 * Result: p ^= (p << SL2 bytes) ^ ((q >> SR1) & MSK) ^ (c >> SR2 bytes) ^ (d << SL1).
 */
static INLINE void mt_recursion_C(UINT32* p, const UINT32* q, const UINT32* c, const UINT32* d)
{
	/* p and c assembled into 64-bit halves of a 128-bit integer. */
	const UINT64 a_lo = ((UINT64)(p[1]) << 32) | (UINT64)(p[0]);
	const UINT64 a_hi = ((UINT64)(p[3]) << 32) | (UINT64)(p[2]);
	const UINT64 c_lo = ((UINT64)(c[1]) << 32) | (UINT64)(c[0]);
	const UINT64 c_hi = ((UINT64)(c[3]) << 32) | (UINT64)(c[2]);

	/* x = a shifted left by SL2 bytes; bits leaving the low half enter the high half. */
	const UINT64 x_lo = a_lo << (SL2 * 8);
	const UINT64 x_hi = (a_hi << (SL2 * 8)) | (a_lo >> (64 - SL2 * 8));

	/* y = c shifted right by SR2 bytes; bits leaving the high half enter the low half. */
	const UINT64 y_lo = (c_lo >> (SR2 * 8)) | (c_hi << (64 - SR2 * 8));
	const UINT64 y_hi = c_hi >> (SR2 * 8);

	/* Combine lane by lane, lowest lane first. */
	p[0] ^= ((UINT32)(x_lo      ) ^ ((q[0] >> SR1) & MSK1) ^ (UINT32)(y_lo      ) ^ (d[0] << SL1));
	p[1] ^= ((UINT32)(x_lo >> 32) ^ ((q[1] >> SR1) & MSK2) ^ (UINT32)(y_lo >> 32) ^ (d[1] << SL1));
	p[2] ^= ((UINT32)(x_hi      ) ^ ((q[2] >> SR1) & MSK3) ^ (UINT32)(y_hi      ) ^ (d[2] << SL1));
	p[3] ^= ((UINT32)(x_hi >> 32) ^ ((q[3] >> SR1) & MSK4) ^ (UINT32)(y_hi >> 32) ^ (d[3] << SL1));
}

/*
 * Regenerates the whole state in place using the portable C recursion.
 *
 * v : state of N 128-bit words, stored as N * 4 consecutive UINT32 values.
 *
 * Word i is updated from itself, the word POS1 positions ahead (wrapping
 * around the end of the state) and the two most recently produced words.
 * For the first two steps those are the last two words of the old state.
 */
static void mt_generate_C(UINT32* v)
{
	const UINT32* c = v + (N - 2) * 4;
	const UINT32* d = v + (N - 1) * 4;

	INT32 i;

	/* Words whose partner i + POS1 is still inside the array. */
	for (i = 0; i < N - POS1; i++) {
		UINT32* cur = v + i * 4;

		mt_recursion_C(cur, v + (i + POS1) * 4, c, d);
		c = d;
		d = cur;
	}

	/* Remaining words: the partner index wraps to the start of the array. */
	for (; i < N; i++) {
		UINT32* cur = v + i * 4;

		mt_recursion_C(cur, v + (i + POS1 - N) * 4, c, d);
		c = d;
		d = cur;
	}
}

/* */

/*
 * One step of the recursion using SSE2, updating the 128-bit word at p in place.
 *
 * p    : word to update; accessed with aligned load/store.
 * q    : second state word; accessed with an aligned load.
 * c    : value shifted right by SR2 bytes.
 * d    : value whose 32-bit lanes are shifted left by SL1 bits.
 * mask : per-lane AND mask applied to (q >> SR1).
 *
 * Returns the new value that was stored to *p.
 */
static INLINE __m128i mt_recursion_SSE2(__m128i* p, const __m128i* q, __m128i c, __m128i d, __m128i mask)
{
	const __m128i a = _mm_load_si128(p);
	const __m128i b = _mm_load_si128(q);

	const __m128i x = _mm_slli_si128(a, SL2);                      /* a << SL2 bytes        */
	const __m128i y = _mm_and_si128(_mm_srli_epi32(b, SR1), mask); /* (b >> SR1) & mask     */
	const __m128i z = _mm_srli_si128(c, SR2);                      /* c >> SR2 bytes        */
	const __m128i w = _mm_slli_epi32(d, SL1);                      /* d << SL1 bits, by lane */

	const __m128i r = _mm_xor_si128(_mm_xor_si128(a, x),
	                                _mm_xor_si128(_mm_xor_si128(y, z), w));

	_mm_store_si128(p, r);

	return r;
}

static void mt_generate_SSE2(__m128i* v)
{
	ALIGN(0x10) static const UINT32 MASK[] = { MSK1, MSK2, MSK3, MSK4 };

	const __m128i mask = _mm_load_si128((const __m128i*)MASK);

	      __m128i* p = v;
	const __m128i* q = v + POS1;

	const __m128i* e0 = v + N - POS1;
	const __m128i* e1 = v + N;

	__m128i c, d, r;

	c = _mm_load_si128(e1 - 2);
	d = _mm_load_si128(e1 - 1);

	for (; p < e0; p += 2, q += 2) {
		r = mt_recursion_SSE2(p + 0, q + 0, c, d, mask);
		c = d;
		d = r;
		r = mt_recursion_SSE2(p + 1, q + 1, c, d, mask);
		c = d;
		d = r;
	}

	q = v;

	for (; p < e1; p += 2, q += 2) {
		r = mt_recursion_SSE2(p + 0, q + 0, c, d, mask);
		c = d;
		d = r;
		r = mt_recursion_SSE2(p + 1, q + 1, c, d, mask);
		c = d;
		d = r;
	}
}

/* */

/*
 * Selects which generator SFMT_CreateContext() installs in new contexts:
 * TRUE -> SSE2 implementation, FALSE -> portable C implementation.
 * Set by SFMT_Initialize() and SFMT_SetEnableSSE2().
 */
static BOOL g_SFMT_Enable_SSE2 = FALSE;

BOOL SFMT_Initialize(void)
{
	INT32 info[4] = { 0 };

	__cpuid(info, 1);

	if ((info[3] & (1 << 26)) != 0) {
		g_SFMT_Enable_SSE2 = TRUE;
	}

	return TRUE;
}

/*
 * Overrides the global SSE2 switch.
 *
 * b : new value of the switch. No check is made that the CPU actually
 *     supports SSE2.
 *
 * Returns the value the switch had before the call. Only contexts created
 * afterwards are affected, because the generator is chosen at creation time.
 */
BOOL SFMT_SetEnableSSE2(BOOL b)
{
	const BOOL previous = g_SFMT_Enable_SSE2;

	g_SFMT_Enable_SSE2 = b;

	return previous;
}

/* */

/* Per-generator state. Allocated and laid out by SFMT_CreateContext(). */
struct SFMTContext {

	/* Generator state words; points into the same allocation as this header. */
	UINT32* vector;

	/* Number of UINT32 values in vector (set to N32 at creation). */
	INT32 size;

	/* Position of the next value to hand out; index >= size forces a regeneration. */
	INT32 index;

	/* Generator routine chosen at creation time (SSE2 or portable C). */
	SFMT_gen_rand32_t gen_rand32;

};

/* */

/*
 * Returns the next 32-bit output of context t (SSE2 variant).
 * When all values of the current state have been consumed, the state is
 * regenerated with mt_generate_SSE2() before reading.
 */
static UINT32 mt_gen_rand32_SSE2(SFMTContext_t* t)
{
	UINT32 value;

	if (t->index >= t->size) {
		/* State exhausted: refill and restart from the first word. */
		mt_generate_SSE2((__m128i*)(t->vector));
		t->index = 0;
	}

	value = t->vector[t->index];
	t->index += 1;

	return value;
}

/*
 * Returns the next 32-bit output of context t (portable C variant).
 * When all values of the current state have been consumed, the state is
 * regenerated with mt_generate_C() before reading.
 */
static UINT32 mt_gen_rand32_C(SFMTContext_t* t)
{
	UINT32 value;

	if (t->index >= t->size) {
		/* State exhausted: refill and restart from the first word. */
		mt_generate_C(t->vector);
		t->index = 0;
	}

	value = t->vector[t->index];
	t->index += 1;

	return value;
}

/* */

SFMTContext_t* SFMT_CreateContext(void)
{
	SFMTContext_t* t = (SFMTContext_t*)mt_malloc(sizeof(SFMTContext_t) + sizeof(UINT32) * N32);
	if (t == NULL) {
		return NULL;
	}

	t->vector     = (UINT32*)(t + 1);
	t->size       = N32;
	t->index      = 0;

	if (g_SFMT_Enable_SSE2) {
		t->gen_rand32 = mt_gen_rand32_SSE2;

	} else {
		t->gen_rand32 = mt_gen_rand32_C;
	}

	return t;
}

/*
 * Destroys a context created by SFMT_CreateContext().
 * The state vector shares the context's allocation, so a single free
 * releases both.
 */
void SFMT_ReleaseContext(SFMTContext_t* t)
{
	mt_free(t);
}

/*
 * Seeds context t from a single 32-bit value.
 *
 * t    : context to (re)initialise; must not be NULL.
 * seed : 32-bit seed.
 *
 * After the state is filled and its parity repaired, the read position is
 * moved to the end so that the first draw regenerates the state.
 */
void SFMT_InitializeContext(
	SFMTContext_t* t,
	UINT32         seed)
{
	mt_initialize(t->vector, seed);
	mt_period_certification(t->vector);

	/* Mark the state as fully consumed. */
	t->index = t->size;
}

/*
 * Seeds context t from an array of 32-bit key words.
 *
 * t   : context to (re)initialise; must not be NULL.
 * key : key words; len elements are read.
 * len : number of key words.
 *
 * After the state is filled and its parity repaired, the read position is
 * moved to the end so that the first draw regenerates the state.
 */
void SFMT_InitializeContextByArray(
	SFMTContext_t* t,
	const UINT32*  key,
	INT32          len)
{
	mt_initialize_array(t->vector, key, len);
	mt_period_certification(t->vector);

	/* Mark the state as fully consumed. */
	t->index = t->size;
}

/* */

/*
 * Returns the generator routine stored in context t when it was created.
 * The returned function takes the context and yields one UINT32 per call.
 */
SFMT_gen_rand32_t SFMT_Get_gen_rand32(
	SFMTContext_t* t)
{
	return t->gen_rand32;
}

/* */

