/*

	Copyright (C) 2012 by Nobuhide Tsuda

	RuviEdit ̃CZX MIT{GPL ȃCZXłB 
	ۏ؁ET|[głAŗpłApAvł\[XR[h𗬗p邱Ƃ\łB 
	i\[XR[h𗬗pꍇAp̒쌠ECZXRuviEdit̂̂܂܂łj 
	M҂́AvO}ɂƂĕsRɂ܂Ȃ̂ɎRRƌGPLnȂ̂ŁA 
	RuviEdit ̃\[XGPLnvWFNgŎgp邱Ƃ֎~܂B 
	GPLvWFNgł͈؂̗p֎~܂ALGPLvWFNgł͓INɂ闬p͋܂B

*/
#include "ViewTokenizer.h"
#include "EditView.h"

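/**
	Design notes: how string literals containing #{...} are tokenized.
	(Left column: input characters; right column: what the tokenizer does.)

===
	"		double quote found: call nextString('"'), m_quote = '"'
	a
	"		nextString: returns on finding m_quote
===
	"		double quote found: call nextString('"'), m_quote = '"'
	a
	#
	{		nextString: returns on finding #{, inBrace flag ON
	exp		processed normally by nextToken()
	}		when } is found while the inBrace flag is ON, call nextString(m_quote)
			inBrace flag OFF
	b
	"		nextString: returns on finding m_quote
===
	"		double quote found: call nextString('"'), m_quote = '"'
	a
	(newline)	returns at the end of the buffer; decide by whether the last character of the token is m_quote (?)
			at the line break highlightBlock() returns, so
			the state cannot be retained across that return
				-> store that information in blockState
	b
	"		nextString: returns on finding m_quote
===
	"		nest = 1, quoteStack = Empty
	a
	(		nest = 2, quoteStack = Empty
	#
	{		quoteStack = ", 2
	'		quoteStack = ", 2
	str
	'		quoteStack = ", 2
	}		nest = 2, quoteStack = Empty
	)		nest = 1, quoteStack = Empty
	"

*/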

//	Declared here, defined outside this part of the file.
bool isLetterOrNumberOrUnderbar(const QChar &ch);
//	Lookup table indexed by character code (256 entries, 16 per row; the row label is the
//	high nibble). An entry of 1 marks the code as an ASCII symbol character.
//	Flagged codes: 0x21-0x2f, 0x3a-0x40, 0x5b-0x60 and 0x7b-0x7e; every other entry is 0.
bool asciiSymbolTable[] = {
/* 0 */	0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
/* 1 */	0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
/* 2 */	0, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
/* 3 */	0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 1, 1, 1, 1, 1, 1,
/* 4 */	1, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
/* 5 */	0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 1, 1, 1, 1, 1,	//	'_' (0x5f) is flagged as a symbol in this table
/* 6 */	1, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
/* 7 */	0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 1, 1, 1, 1, 0,
/* 8 */	0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
/* 9 */	0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
/* a */	0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
/* b */	0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
/* c */	0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
/* d */	0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
/* e */	0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
/* f */	0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
};
//	Returns true when qch is a 7-bit ASCII code that asciiSymbolTable flags as a symbol.
//	Any code of 0x80 or above yields false without consulting the table.
bool isAsciiSymbol(QChar qch)
{
	const ushort u = qch.unicode();
	if( u >= 0x80 )
		return false;
	return asciiSymbolTable[u];
}
//	Returns true when qch is one of the ASCII hexadecimal digits 0-9, a-f or A-F.
bool isHexChar(QChar qch)
{
	const ushort u = qch.unicode();
	if( u >= '0' && u <= '9' )
		return true;
	if( u >= 'a' && u <= 'f' )
		return true;
	return u >= 'A' && u <= 'F';
}
bool isJustBeforeRegexp(const QString &prev, const QString &prev2)
{
	if( prev == "]" || prev == ")" || prev == "." )
		return false;
	if( prev.isEmpty() || isAsciiSymbol(prev[0]) )
		return true;
	if( prev2 == "." && prev[0].isLetter() )	//	.\bh /regexp/ ̏ꍇ
		return true;
	return prev == "p" || prev == "puts" || prev == "print"
			|| prev == "if" || prev == "elsif"
			|| prev == "while" || prev == "until";
}

//	Creates a tokenizer positioned at the start of buffer.
//	buffer	text to tokenize (stored in m_buffer)
//	view	associated EditView (stored in m_view; not used by the constructor itself)
ViewTokenizer::ViewTokenizer(const QString &buffer, EditView *view)
	: m_buffer(buffer), m_view(view)

{
	m_ix = 0;
	m_tokenType = UNDEF;
	//	Give every per-token field a defined value. Some of them (m_quoteLength in
	//	particular) are only written on the string/regexp paths, so without this they
	//	would hold indeterminate values after other kinds of token.
	m_tokenOffset = 0;
	m_quoteLength = 0;
	m_tokenCloseQuoteLength = 0;
	m_foundExp = false;
	m_mlString = false;
}

//	The destructor body is empty; no explicit cleanup is performed here.
ViewTokenizer::~ViewTokenizer()
{
}
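//	nextString() is called either [1] by nextToken() when it finds an opening quote, or
//	[2] from RubySyntaxHighliter::highlightBlock() when the line starts inside a multi-line
//	string or a here-document.
//	In case [1] a new item is pushed onto quoteStack.
//	In case [2] the item has already been pushed before the call (i.e. the state at the end
//	of the previous line is carried over as-is).
//	[1] and [2] are told apart by whether quoteLength is 0 (it is 0 in case [2]).
//	For a string, the type, closing quote and nest level are kept on the stack.
//	When #{ is found in anything other than an SQ_STRING, the type, closing quote and
//	nest level are pushed onto the stack.
//	When } returns us to the original context, the closing quote and nest level are
//	popped from the stack.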
//	Scans a string literal, or the continuation of one, starting at m_ix.
//
//	quote		closing quote character (only used when quoteLength != 0)
//	quoteLength	length of the opening delimiter at m_ix; 0 means "resume the string whose
//				state is on top of m_quoteStack"
//	strType		string type to report; any type other than SQ_STRING stops at "#{"
//	openQuote	opening character whose occurrences raise the nest level, or '\0'
//
//	Scanning stops at the first of:
//	  - end of buffer: m_mlString is set and the string state is pushed on m_quoteStack;
//	  - "#{" (not for SQ_STRING): m_foundExp is set, the string state and an EXPAND_EXP
//	    item are pushed on m_quoteStack;
//	  - the closing quote at nest level 0.
//	Returns the string type, which is also stored in m_tokenType. m_tokenOffset,
//	m_tokenText, m_quoteLength and m_tokenCloseQuoteLength describe the scanned piece.
uchar ViewTokenizer::nextString(char quote, int quoteLength,
								uchar strType,		//	for DQ, BQ etc. scanning returns at #{
								char openQuote)		//	set when the delimiter is one of ([{<
{
	StringItem si;
	if( quoteLength )	//	an opening quote was just found
		si = StringItem(strType, quote, openQuote, 1);
	else {
		//	Continuation of a suspended string (multi-line string, or text after "#{...}").
		//	NOTE(review): assumes the caller left the suspended item on m_quoteStack;
		//	last() on an empty stack is invalid.
		si = m_quoteStack.last();
		m_quoteStack.pop_back();
		//	For a here-document the reported type is the one stored in m_strType.
		strType = si.m_type == HERE_DOCUMENT ? si.m_strType : si.m_type;
	}
	m_foundExp = false;
	m_mlString = false;
	m_tokenOffset = m_ix;
	m_ix += (m_quoteLength = quoteLength);
	for(;;) {
		if( m_ix >= m_buffer.length() ) {		//	the string continues on the next line
			m_mlString = true;
			m_tokenCloseQuoteLength = 0;
			m_quoteStack.push_back(si);
			break;
		}
		//	Compare the two characters in place instead of copying the rest of the
		//	buffer with mid() for every character scanned.
		if( strType != SQ_STRING
			&& m_buffer[m_ix] == '#'
			&& m_ix + 1 < m_buffer.length()
			&& m_buffer[m_ix + 1] == '{' )
		{
			m_foundExp = true;
			m_quoteStack.push_back(si);			//	resumed after the matching '}'
			m_quoteStack.push_back(StringItem(EXPAND_EXP, '}'));
			m_ix += (m_tokenCloseQuoteLength = 2);
			break;
		}
		QChar c = m_buffer[m_ix++];
		if( c == si.m_closeQuote && !--si.m_nestLevel ) {
			m_tokenCloseQuoteLength = 1;
			break;
		}
		if( si.m_openQuote != '\0' && c == si.m_openQuote )
			++si.m_nestLevel;
		else if( c == '\\' && m_ix < m_buffer.length() )
			++m_ix;		//	skip the escaped character, even when it is the last one in
						//	the buffer (same bound as regexp() uses)
	}
	m_tokenText = m_buffer.mid(m_tokenOffset, m_ix - m_tokenOffset);
	return m_tokenType = strType;
}
//	Scans a regexp literal. m_tokenOffset is not set here; the token text is taken from
//	the existing m_tokenOffset up to the final m_ix.
//
//	quote		closing delimiter
//	quoteLength	number of characters to step over before scanning (stored in m_quoteLength)
//	openQuote	opening character whose occurrences raise the nest level, or '\0'
//
//	Scanning ends at the closing delimiter on nest level 0 (m_tokenCloseQuoteLength = 1)
//	or at the end of the buffer. A backslash makes the following character be skipped.
//	Returns REGEXP, also stored in m_tokenType.
uchar ViewTokenizer::regexp(char quote, int quoteLength,
							char openQuote)		//	set when the delimiter is one of ([{<
{
	m_quoteLength = quoteLength;
	m_ix += m_quoteLength;
	int depth = 1;
	for(;;) {
		if( m_ix >= m_buffer.length() )
			break;					//	ran off the end without a closing delimiter
		const QChar ch = m_buffer[m_ix];
		++m_ix;
		if( ch == quote ) {
			--depth;
			if( depth == 0 ) {
				m_tokenCloseQuoteLength = 1;
				break;
			}
		}
		if( openQuote != '\0' && ch == openQuote ) {
			++depth;
			continue;
		}
		if( ch == '\\' && m_ix < m_buffer.length() )
			++m_ix;					//	escaped character
	}
	m_tokenText = m_buffer.mid(m_tokenOffset, m_ix - m_tokenOffset);
	m_tokenType = REGEXP;
	return m_tokenType;
}
//	Scans a %s literal: steps over quoteLength characters (stored in m_quoteLength), then
//	consumes characters up to and including the first occurrence of quote, or to the end
//	of the buffer. The token text runs from the existing m_tokenOffset to the final m_ix.
//	Returns PERCENT_SYMBOL, also stored in m_tokenType.
uchar ViewTokenizer::percentSymbol(char quote, int quoteLength)
{
	m_quoteLength = quoteLength;
	m_ix += m_quoteLength;
	while( m_ix < m_buffer.length() ) {
		const QChar ch = m_buffer[m_ix];
		++m_ix;
		if( ch == quote )
			break;
	}
	m_tokenText = m_buffer.mid(m_tokenOffset, m_ix - m_tokenOffset);
	m_tokenType = PERCENT_SYMBOL;
	return m_tokenType;
}
//	Removes the top item of m_quoteStack; does nothing when the stack is empty.
void ViewTokenizer::popNestLevelAndQuote()
{
	if( !m_quoteStack.isEmpty() )
		m_quoteStack.pop_back();
}
//	Advances the read position by one character unless it is already at the end of the buffer.
void ViewTokenizer::skipChar()
{
	if( m_ix >= m_buffer.size() )
		return;
	++m_ix;
}
//	Extracts the next token from m_buffer, starting at m_ix.
//
//	Leading whitespace is skipped. On return m_tokenOffset / m_tokenText describe the
//	token and m_ix points just past it. The texts of the two previous tokens are kept in
//	m_prevText / m_prev2Text and drive the "def" and regexp heuristics.
//
//	Returns (and stores in m_tokenType) one of:
//	  END_OF_BUFFER	nothing left (also returned again on every later call)
//	  NUMBER		numeric literal, or a ?\C-x / ?\M-x / ?\M-\C-x character literal
//	  IDENT			identifier (with a trailing ! ? = in the cases handled below)
//	  SYMBOL		any other character, with a few two-character operators merged
//	  or whatever nextString() / regexp() / percentSymbol() return for literals.
uchar ViewTokenizer::nextToken()
{
	if( m_tokenType == END_OF_BUFFER || m_buffer.isEmpty() )
		return m_tokenType = END_OF_BUFFER;
	m_foundExp = false;
	m_tokenCloseQuoteLength = 0;
	m_prev2Text = m_prevText;
	m_prevText = m_tokenText;
	m_mlString = false;
	while( m_ix < m_buffer.length() && m_buffer[m_ix].isSpace() )
		++m_ix;
	if( m_ix >= m_buffer.length() )
		return m_tokenType = END_OF_BUFFER;
	m_tokenOffset = m_ix;
	if( m_buffer[m_ix].isNumber() ) {
		if( m_buffer[m_ix] == '0' && m_ix + 2 < m_buffer.length() ) {
			//	Strictly only 0 and 1 may follow 0b, but in "0b123" the "23" would just be
			//	a decimal number, so restricting the digits has little value syntactically.
			//	"0b23" is an error, but handling it that strictly is considered unnecessary.
			QChar base = m_buffer[m_ix+1].toLower();
			if( (base == 'b' || base == 'o' || base == 'd')
				&& m_buffer[m_ix+2].isNumber() )
			{
				m_ix += 2;
				while( ++m_ix < m_buffer.length() && m_buffer[m_ix].isNumber() ) {}
				m_tokenText = m_buffer.mid(m_tokenOffset, m_ix - m_tokenOffset);
				return m_tokenType = NUMBER;
			}
			if( base == 'x' && isHexChar(m_buffer[m_ix+2]) ) {
				m_ix += 2;
				while( ++m_ix < m_buffer.length() && isHexChar(m_buffer[m_ix]) ) {}
				m_tokenText = m_buffer.mid(m_tokenOffset, m_ix - m_tokenOffset);
				return m_tokenType = NUMBER;
			}
		}
		//	\d+[.\d+][[e|E][+|-]\d+]
		while( ++m_ix < m_buffer.length() && m_buffer[m_ix].isNumber() ) {}
		if( m_ix + 1 < m_buffer.length() && m_buffer[m_ix] == '.'
			&& m_buffer[m_ix+1].isNumber() )	//	"123." (no digit after the dot) is not accepted
		{
			while( ++m_ix < m_buffer.length() && m_buffer[m_ix].isNumber() ) {}
		}
		//	The exponent is only consumed when at least one digit follows it.
		int ix = m_ix;
		if( m_ix < m_buffer.length() && (m_buffer[m_ix] == 'e' || m_buffer[m_ix] == 'E') ) {
			if( ++ix < m_buffer.length() && (m_buffer[ix] == '+' || m_buffer[ix] == '-') )
				++ix;
			if( ix < m_buffer.length() && m_buffer[ix].isNumber() ) {
				while( ++ix < m_buffer.length() && m_buffer[ix].isNumber() ) {}
				m_ix = ix;
			}
		}
		m_tokenText = m_buffer.mid(m_tokenOffset, m_ix - m_tokenOffset);
		return m_tokenType = NUMBER;
	}
	if( isLetterOrNumberOrUnderbar(m_buffer[m_ix]) ) {
		while( ++m_ix < m_buffer.length() && isLetterOrNumberOrUnderbar(m_buffer[m_ix]) ) {}
		if( m_ix < m_buffer.length() ) {
			//	A trailing ! or ? belongs to the name after "def", "." or ":";
			//	a trailing = belongs to it after "def" only.
			if( ((m_prevText == "def" || m_prevText == "." || m_prevText == ":")
					&& (m_buffer[m_ix] == '!' || m_buffer[m_ix] == '?'))
				|| (m_prevText == "def" && m_buffer[m_ix] == '=') )
			{
				++m_ix;
			}
		}
		m_tokenText = m_buffer.mid(m_tokenOffset, m_ix - m_tokenOffset);
		return m_tokenType = IDENT;
	}
	if( m_prevText != "def" ) {
		if( m_buffer[m_ix] == '\"' )
			return nextString('\"', 1, DQ_STRING);
		if( m_buffer[m_ix] == '\'' )
			return nextString('\'', 1, SQ_STRING);
		if( m_buffer[m_ix] == '`' )
			return nextString('`', 1, BQ_STRING);
		if( m_buffer[m_ix] == '%' ) {
			//	%-literal: optional type letter, then an ASCII symbol as the delimiter.
			uchar type = DQ_STRING;
			int n = 1;		//	offset of the delimiter from the '%'
			if( m_ix + 1 < m_buffer.size() ) {
				QChar t = m_buffer[m_ix + 1];
				if( t == 'Q' || t == 'W' || t == 's' )
					++n;
				else if( t == 'q' || t == 'w' ) {
					++n;
					type = SQ_STRING;
				} else if( t == 'r' ) {
					++n;
					type = REGEXP;
				} else if( t == 'x' ) {
					++n;
					type = BQ_STRING;
				}
				QChar qc;
				if( m_ix + n < m_buffer.size() &&
					isAsciiSymbol(qc = m_buffer[m_ix + n]) )
				{
					char quote = (char)m_buffer[m_ix + n].unicode();
					char openQuote = quote;
					//	A bracket delimiter is closed by its counterpart.
					int i = QString("([{<").indexOf(quote);
					if( i >= 0 ) quote = ")]}>"[i];
					if( t == 'r' )
						return regexp(quote, n + 1, openQuote);
					else if( t == 's' )
						return percentSymbol(quote, n + 1);
					else
						return nextString(quote, n + 1, type, openQuote);
				}
			}
		}
	}
	if( inBrace() && m_buffer[m_ix] == '}' ) {
		//	End of a "#{...}" expression: drop the EXPAND_EXP item and resume the string.
		popNestLevelAndQuote();
		uchar rc = nextString();
		m_quoteLength = 1;		//	for '}'
		return rc;
	}
	if( m_ix + 7 < m_buffer.length()
		&& m_buffer.mid(m_ix, 7) == QString("?\\M-\\C-") )
	{
		m_tokenText = m_buffer.mid(m_ix, 8);
		m_ix += 8;
		return m_tokenType = NUMBER;
	}
	if( m_buffer[m_ix] == '?' && m_ix + 4 < m_buffer.length()
		&& m_buffer[m_ix + 1] == '\\'
		&& (m_buffer[m_ix + 2] == 'C' || m_buffer[m_ix + 2] == 'M')
		&& m_buffer[m_ix + 3] == '-' )
	{
		m_tokenText = m_buffer.mid(m_ix, 5);
		m_ix += 5;
		return m_tokenType = NUMBER;
	}
	m_tokenText = m_buffer[m_ix++];
	if( m_prevText == "def" ) {		//	symbols following "def" are taken as the method name
		while( m_ix < m_buffer.length() && m_buffer[m_ix] != '('
				&& isAsciiSymbol(m_buffer[m_ix]) )
		{
			m_tokenText += m_buffer[m_ix++];
		}
		return m_tokenType = SYMBOL;
	}
	if( m_tokenText == "/" ) {
		if( isJustBeforeRegexp(m_prevText, m_prev2Text) ) {
			//	regexp() steps over the opening delimiter itself. m_ix has already been
			//	moved past the '/', so rewind first; otherwise the first character of
			//	the regexp body is skipped unexamined ("//" and "/\/x/" were mis-scanned).
			m_ix = m_tokenOffset;
			return regexp('/', 1);
		}
	}
	if( m_tokenText == "$" && m_ix < m_buffer.length() && !m_buffer[m_ix].isLetter() ) {
		//	'$' followed by a non-letter: the two characters form one token
		m_tokenText += m_buffer[m_ix++];
	} else if( m_tokenText == QChar('<')
		&& m_ix < m_buffer.length()
		&& m_buffer[m_ix] == QChar('<') )	//	"<<" is a single token
	{
		m_tokenText += QChar('<');
		++m_ix;
	} else if( (m_tokenText == QChar('!') || m_tokenText == QChar('=') ||
		m_tokenText == QChar('>') || m_tokenText == QChar('<'))
		&& m_ix < m_buffer.length()
		&& m_buffer[m_ix] == QChar('=') )	//	"==", "!=", ">=" and "<=" are single tokens
	{
		m_tokenText += QChar('=');
		++m_ix;
	} else if( (m_tokenText == ":" || m_tokenText == ".")
				&& m_ix < m_buffer.length() && m_tokenText == m_buffer[m_ix] )
	{
		m_tokenText += m_buffer[m_ix];		//	"::" (and "..")
		++m_ix;
	}
	return m_tokenType = SYMBOL;
}
//	Returns true when the top item of m_quoteStack is an EXPAND_EXP entry, i.e. the
//	tokenizer is currently inside a "#{...}" expression. An empty stack yields false.
bool ViewTokenizer::inBrace() const
{
	if( m_quoteStack.isEmpty() )
		return false;
	return m_quoteStack.last().m_type == EXPAND_EXP;
}
