//----------------------------------------------------------------------
//
//			File:			"charEncoding.cpp"
//			Created:		03-Mar-2011
//			Author:			Nobuhide Tsuda
//			Description:	Character-encoding functions
//
//----------------------------------------------------------------------

/*

	Copyright (C) 2012 by Nobuhide Tsuda

	RuviEdit ̃CZX MIT{GPL ȃCZXłB 
	ۏ؁ET|[głAŗpłApAvł\[XR[h𗬗p邱Ƃ\łB 
	i\[XR[h𗬗pꍇAp̒쌠ECZXRuviEdit̂̂܂܂łj 
	M҂́AvO}ɂƂĕsRɂ܂Ȃ̂ɎRRƌGPLnȂ̂ŁA 
	RuviEdit ̃\[XGPLnvWFNgŎgp邱Ƃ֎~܂B 
	GPLvWFNgł͈؂̗p֎~܂ALGPLvWFNgł͓INɂ闬p͋܂B

*/


#include <QtCore>
#include <QMessageBox>
#include "charEncoding.h"

//	First byte of a two-byte half-width katakana sequence in EUC: when
//	checkCharEncoding() sees this byte, the byte after it must satisfy isHankakuKana().
#define		EUC_KANA_LEADBYTE		((uchar)0x8e)

//	Lookup table indexed by byte value: true when the byte can be the FIRST (lead)
//	byte of a double-byte character. Marked ranges: 0x81-0x9f and 0xe0-0xfd.
//	Read through isDBCSLeadByte(); must stay in step with DBCSsizeTable.
bool const DBCStable1[0x100] = {		//	is this byte a DBCS first byte?
/* 0 */	0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
/* 1 */	0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
/* 2 */	0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
/* 3 */	0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
/* 4 */	0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
/* 5 */	0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
/* 6 */	0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
/* 7 */	0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
/* 8 */	0, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
/* 9 */	1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
/* a */	0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
/* b */	0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
/* c */	0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
/* d */	0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
/* e */	1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
/* f */	1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 0, 0,
};

//	Lookup table indexed by byte value: true when the byte can be the SECOND (trail)
//	byte of a double-byte character. Marked ranges: 0x40-0x7e and 0x80-0xfd.
//	Read through isDBCSSecondByte().
bool const DBCStable2[0x100] = {		//	is this byte a DBCS second byte?
/* 0 */	0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
/* 1 */	0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
/* 2 */	0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
/* 3 */	0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
/* 4 */	1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
/* 5 */	1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
/* 6 */	1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
/* 7 */	1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 0,
/* 8 */	1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
/* 9 */	1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
/* a */	1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
/* b */	1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
/* c */	1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
/* d */	1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
/* e */	1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
/* f */	1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 0, 0,
};

//	Lookup table indexed by the first byte of a character: 2 for the byte values
//	0x81-0x9f and 0xe0-0xfd (the same ranges DBCStable1 marks as lead bytes),
//	1 for every other value.
uchar const DBCSsizeTable[0x100] = {
/* 0 */	1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
/* 1 */	1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
/* 2 */	1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
/* 3 */	1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
/* 4 */	1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
/* 5 */	1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
/* 6 */	1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
/* 7 */	1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
/* 8 */	1, 2, 2, 2, 2, 2, 2, 2,  2, 2, 2, 2, 2, 2, 2, 2,
/* 9 */	2, 2, 2, 2, 2, 2, 2, 2,  2, 2, 2, 2, 2, 2, 2, 2,
/* a */	1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
/* b */	1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
/* c */	1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
/* d */	1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
/* e */	2, 2, 2, 2, 2, 2, 2, 2,  2, 2, 2, 2, 2, 2, 2, 2,
/* f */	2, 2, 2, 2, 2, 2, 2, 2,  2, 2, 2, 2, 2, 2, 1, 1,
};

//	Reports whether ch may start a double-byte character, as recorded in DBCStable1.
bool isDBCSLeadByte(uchar ch)
{
	const bool leadFlag = DBCStable1[ch];
	return leadFlag;
}
//	Reports whether ch may be the second byte of a double-byte character,
//	as recorded in DBCStable2.
bool isDBCSSecondByte(uchar ch)
{
	const bool trailFlag = DBCStable2[ch];
	return trailFlag;
}

//	Byte-order-mark signatures. The is*Bom() helpers below compare the first bytes
//	of a buffer against these arrays.
uchar	UTF8_BOM[UTF8_BOM_LENGTH] = {0xef, 0xbb, 0xbf};		//	UTF-8
uchar	UTF16LE_BOM[UTF16_BOM_LENGTH] = {0xff, 0xfe};		//	UTF-16, little endian
uchar	UTF16BE_BOM[UTF16_BOM_LENGTH] = {0xfe, 0xff};		//	UTF-16, big endian

//	True when the range [ptr, endptr) holds at least two bytes and they equal
//	the two bytes of UTF16LE_BOM.
inline bool isUTF16leBom(cuchar *ptr, cuchar *endptr)
{
	if( endptr - ptr < 2 )
		return false;		//	too short to contain the signature
	const bool firstMatches = ptr[0] == UTF16LE_BOM[0];
	return firstMatches && ptr[1] == UTF16LE_BOM[1];
}

//	True when the range [ptr, endptr) holds at least two bytes and they equal
//	the two bytes of UTF16BE_BOM.
inline bool isUTF16beBom(cuchar *ptr, cuchar *endptr)
{
	if( endptr - ptr < 2 )
		return false;		//	too short to contain the signature
	const bool firstMatches = ptr[0] == UTF16BE_BOM[0];
	return firstMatches && ptr[1] == UTF16BE_BOM[1];
}

//	True when the range [ptr, endptr) holds at least three bytes and they equal
//	the three bytes of UTF8_BOM.
inline bool isUTF8Bom(cuchar *ptr, cuchar *endptr)
{
	if( endptr - ptr < 3 )
		return false;		//	too short to contain the signature
	for( int i = 0; i < 3; ++i ) {
		if( ptr[i] != UTF8_BOM[i] )
			return false;
	}
	return true;
}

/**
 *	Guess the character encoding of the byte range [ptr, endptr).
 *
 *	A leading byte-order mark decides immediately. Otherwise at most the first
 *	64 KiB are scanned (the last two bytes of that window are only used as
 *	look-ahead) while tracking which of Shift-JIS, EUC and UTF-8 are still
 *	possible. A NUL byte followed by NULs at every other position is reported
 *	as UTF-16. When the scan is inconclusive, the surviving candidate with the
 *	most multi-byte payload wins; as a last resort the line-ending style decides
 *	(CR+LF at least as frequent as bare LF -> SJIS, otherwise EUC).
 *
 *	Every look-ahead read is checked against the real end of the buffer, so the
 *	function never reads at or beyond endptr, including for 0- or 1-byte input.
 *
 *	@param ptr			first byte of the buffer
 *	@param endptr		one past the last byte of the buffer
 *	@param BOMLength	receives the BOM size in bytes (3 or 2), or 0 when no BOM is present
 *	@return				a CharEncoding value: UTF8, UTF16_LE, UTF16_BE, EUC or SJIS
 */
uchar checkCharEncoding(cuchar *ptr, cuchar *endptr, int &BOMLength)
{
	if( isUTF8Bom(ptr, endptr) ) {
		BOMLength = 3;
		return CharEncoding::UTF8;
	}
	if( isUTF16leBom(ptr, endptr) ) {
		BOMLength = 2;
		return CharEncoding::UTF16_LE;
	}
	if( isUTF16beBom(ptr, endptr) ) {
		BOMLength = 2;
		return CharEncoding::UTF16_BE;
	}
	BOMLength = 0;
	cuchar *ptr0 = ptr;
	cuchar *const bufEnd = endptr;	//	real end of the data; all look-ahead is bounded by this

	bool	psEUC = true;		//	EUC is still possible
	bool	psSJIS = true;		//	SJIS is still possible
	bool	psUTF8 = true;		//	UTF-8 is still possible
	bool	psUTF16LE = true;	//	UTF-16LE is still possible
	bool	psUTF16BE = true;	//	UTF-16BE is still possible (never cleared, see NOTE below)
	bool	dbSJIS = false;		//	at least one SJIS double-byte character was seen
	int		nthEUC = 0;			//	0: expecting first byte, 1: expecting second byte
	int		nthSJIS = 0;		//	0: expecting first byte, 1: expecting second byte
	int		nthUTF8 = 0;		//	0: expecting lead byte, 1/2: inside a sequence
	int		nextNthUTF8 = 0;	//	state after the first continuation byte (2: 3-byte sequence, 0: 2-byte sequence)
	int		unixRetCount = 0;	//	number of bare 0x0a line ends
	int		dosRetCount = 0;	//	number of 0x0d 0x0a line ends
	int		zeroCount[] = {0, 0};	//	NUL bytes seen at even / odd offsets
	int		nByteSJIS = 0;		//	bytes interpretable as SJIS double-byte characters
	int		nByteEUC = 0;		//	bytes interpretable as EUC double-byte characters
	int		nByteUTF8 = 0;		//	bytes interpretable as UTF-8 multi-byte characters

	if( endptr - ptr > 0x10000 )
		endptr = ptr + 0x10000;		//	scan at most 64 KiB
	if( endptr - ptr >= 2 )
		endptr -= 2;	//	keep two bytes of look-ahead behind the scan position
	uchar uch;
	while( ptr < endptr ) {
		if( (uch = (uchar)*ptr++) >= 0x80 ) {
			//	Look-ahead availability. Both are always true for buffers of two or more
			//	bytes (because of the trimming above); they only matter for a 1-byte buffer,
			//	where a missing byte is treated as "sequence invalid".
			const bool hasNext = ptr < bufEnd;
			const bool hasNext2 = hasNext && bufEnd - ptr > 1;

			if( psSJIS ) {			//	SJIS is still a candidate
				if( nthSJIS == 0 ) {		//	first byte of a character
					if( isDBCSLeadByte(uch) && hasNext && isDBCSSecondByte((uchar)ptr[0]) ) {
						dbSJIS = true;
						nthSJIS = 1;
						nByteSJIS += 2;
					} else if( !isHankakuKana(uch) ) {
						psSJIS = false;
						if( !psUTF16LE && !psUTF16BE ) {
							if( !psUTF8 )
								return CharEncoding::EUC;
							if( !psEUC )
								return CharEncoding::UTF8;
						}
					}
				} else
					nthSJIS = 0;
			}
			if( psEUC ) {			//	EUC is still a candidate
				if( nthEUC == 0 ) {
					if( isEUCcode(uch) && hasNext && isEUCcode((uchar)ptr[0]) &&
						!(uch == EUC_KANA_LEADBYTE && !isHankakuKana((uchar)ptr[0])) )
					{
						nthEUC = 1;
						nByteEUC += 2;
					} else {
						psEUC = false;
						if( !psUTF16LE && !psUTF16BE ) {
							if( !psUTF8 )
								return CharEncoding::SJIS;
							if( !psSJIS )
								return CharEncoding::UTF8;
						}
					}
				} else
					nthEUC = 0;
			}
			if( psUTF8 ) {			//	UTF-8 is still a candidate
				switch( nthUTF8 ) {
				case 0:
					if( (uch & 0xe0) == 0xc0 ) {	//	lead byte of a 2-byte sequence
						if( hasNext && ((uchar)ptr[0] & 0xc0) == 0x80 ) {
							nextNthUTF8 = 0;
							nthUTF8 = 1;
							nByteUTF8 += 2;
						} else
							psUTF8 = false;
					} else if( (uch & 0xf0) == 0xe0 ) {	//	lead byte of a 3-byte sequence
						if( hasNext2 && ((uchar)ptr[0] & 0xc0) == 0x80 &&
							((uchar)ptr[1] & 0xc0) == 0x80 )
						{
							nextNthUTF8 = 2;
							nthUTF8 = 1;
							nByteUTF8 += 3;
						} else
							psUTF8 = false;
					} else
						psUTF8 = false;		//	NOTE: 4-byte sequences (lead 0xf0-0xf7) are rejected here too
					break;
				case 1:
					nthUTF8 = nextNthUTF8;		//	2 (one more continuation byte) or 0 (sequence done)
					break;
				case 2:
					nthUTF8 = 0;
					break;
				}
				if( !psUTF8 && !psUTF16LE && !psUTF16BE ) {
					if( !psEUC )
						return CharEncoding::SJIS;
					if( !psSJIS )
						return CharEncoding::EUC;
				}
			}
		} else {	//	uch < 0x80
			nthSJIS = nthEUC = 0;
			if( uch == 0 ) {
				zeroCount[(ptr - ptr0 - 1)&1] += 1;
				//	A NUL with NULs at every other following position is taken as UTF-16.
				//	ptr[5] is the farthest byte inspected, so require six readable bytes.
				if( bufEnd - ptr > 5 && ptr[1] == 0 && ptr[3] == 0 && ptr[5] == 0 ) {
					return zeroCount[0] <= zeroCount[1] ? CharEncoding::UTF16_LE : CharEncoding::UTF16_BE;
				}
			}
			if( uch < ' ' ) {
				//	NOTE(review): both tests clear psUTF16LE and nothing ever clears
				//	psUTF16BE, so the `!psUTF16LE && !psUTF16BE` early exits above never
				//	fire. The second test looks as if it was meant for UTF-16BE; it is
				//	left as is because enabling the early exits would change results.
				//	In UTF-16LE the byte after a control character should be 0x00.
				if( ptr < endptr && *ptr != '\0' )
					psUTF16LE = false;
				//	The byte before a control character should be 0x00.
				if( ptr - ptr0 >= 2 && ptr[-2] != '\0' )
					psUTF16LE = false;
			}
		}
		//	Line-ending statistics. JIS (ESC sequences) is deliberately not detected.
		switch( uch ) {
		case 0x0a:
			unixRetCount += 1;
			break;
		case 0x0d:
			if( ptr < endptr && *ptr == 0x0a ) {
				ptr += 1;		//	consume the LF so it is not counted as a bare LF
				dosRetCount += 1;
			}
			break;
		}
	}
	//	Exactly one candidate left.
	if( psSJIS && !psEUC && !psUTF8 )
		return CharEncoding::SJIS;
	if( !psSJIS && psEUC && !psUTF8 )
		return CharEncoding::EUC;
	if( !psSJIS && !psEUC && psUTF8 )
		return CharEncoding::UTF8;
	//	Several candidates left: prefer the one that explains the most multi-byte data.
	if( psSJIS && nByteSJIS > nByteEUC && nByteSJIS > nByteUTF8 )
		return CharEncoding::SJIS;
	if( psEUC && nByteEUC > nByteSJIS && nByteEUC > nByteUTF8 )
		return CharEncoding::EUC;
	if( psUTF8 && nByteUTF8 > nByteSJIS && nByteUTF8 > nByteEUC )
		return CharEncoding::UTF8;

	if( psSJIS && dbSJIS && !psEUC ) return CharEncoding::SJIS;			//	contains SJIS double-byte characters
	//	Nothing conclusive (e.g. plain ASCII): decide by the dominant line-ending style.
	return dosRetCount >= unixRetCount ? CharEncoding::SJIS : CharEncoding::EUC;
}
//	Maps a CharEncoding value to the corresponding codec name string.
//	Returns 0 for any value that is not one of the five handled encodings.
cchar *codecName(uchar enc)
{
	cchar *name = 0;
	if( enc == CharEncoding::UTF8 )
		name = "UTF-8";
	else if( enc == CharEncoding::UTF16_LE )
		name = "UTF-16LE";
	else if( enc == CharEncoding::UTF16_BE )
		name = "UTF-16BE";
	else if( enc == CharEncoding::EUC )
		name = "EUC-JP";
	else if( enc == CharEncoding::SJIS )
		name = "Shift-JIS";
	return name;
}
#if 0
//	Disabled (compiled out by the surrounding #if 0).
//
//	Reads the whole file, guesses its encoding with checkCharEncoding() and decodes
//	it into `buffer` through the matching QTextCodec (falling back to the locale codec).
//
//	fileName	path of the file to read
//	buffer		receives the decoded text; cleared when the file cannot be opened
//	errorString	receives QFile::errorString() when the file cannot be opened
//	encPtr		if non-null, receives the detected encoding
//	wbPtr		if non-null, receives true when a BOM was found (BOMLength != 0)
//	returns		true on success; false when the file cannot be opened or no codec is available
//
//	NOTE(review): the local variable `codecName` below hides the codecName() function
//	inside its own initializer, so this code would presumably not compile if re-enabled.
bool loadFile(const QString &fileName, QString &buffer, QString &errorString,
				uchar *encPtr, bool *wbPtr)
{
	QDir path(fileName);
	QFile file(path.path());
	if( !file.open(QFile::ReadOnly /*| QFile::Text*/) ) {
		errorString = file.errorString();
		buffer.clear();
		return false;
	}
	QByteArray ba = file.readAll();
	cuchar *ptr = (uchar *)(ba.data());
	//cuchar *ptr = static_cast<uchar *>(ba.data());
	cuchar *endptr = ptr + ba.size();
	int BOMLength;
	uchar enc = checkCharEncoding(ptr, endptr, BOMLength);
	cchar *codecName = codecName(enc);
	QTextCodec *codec = codecName ? QTextCodec::codecForName(codecName) : 0;
	if( codec == 0 )
		codec = QTextCodec::codecForLocale();		//	unknown name: fall back to the locale codec
	if( codec == 0 ) {
		QMessageBox::warning(0, "ViVi 5.0",
			QObject::tr("No QTextCodec for %1.")
							 .arg(QString(codecName ? codecName : "Locale")));
		return false;
	}
	buffer = codec->toUnicode(ba);
	if( encPtr != 0 ) *encPtr = enc;
	if( wbPtr != 0 ) *wbPtr = BOMLength != 0;
	return true;
}
#endif
