// Lexer.h
// (c) 2004 exeal

#ifndef _LEXER_H_
#define _LEXER_H_
#include "AscensionCommon.h"
#include "..\..\Manah\SmallObject.h"
#include <list>
#include <set>
#include <map>


namespace Ascension {

///	Cookie value that identifies one registered token group, such as a keyword set or an annotation rule.
///	Note that CToken keeps only the low 12 bits of a cookie.
typedef short	TokenCookie;
///	Reserved cookie value (zero). Presumably means "no cookie" -- TODO confirm against the cookie allocation in CLexer.
const TokenCookie	NullCookie = 0;

///	Kinds of token.
///	CToken stores this value in 4 bits, so there must never be more than 16 enumerators.
///	@see	EmphaticTextType
enum TokenType {
	///	White space characters
	TT_WHITESPACE,
	///	Tab
	TT_TAB,
	///	Keyword
	TT_KEYWORD,
	///	Annotation (comment)
	TT_ANNOTATION,
	///	Operator
	TT_OPERATOR,
	///	Identifier
	TT_IDENTIFIER,
	///	Numerals (a run of digits)
	TT_NUMERAL,
	///	Number (numeric literal)
	TT_NUMBER,
	///	Single quotation
	TT_SINGLEQUOTATION,
	///	Double quotation
	TT_DOUBLEQUOTATION,
	///	Other Unicode quotation
	TT_OTHERQUOTATION,
	///	ASCII control character
	TT_ASCII_CONTROL,
	///	Unicode control character
	TT_UNICODE_CONTROL,
	///	Unspecified
	TT_UNSPECIFIED,
	///	Number of token types (not a token type itself; sizes CLexer's per-type table)
	TT_COUNT
};

///	Format used to recognize numbers
enum NumberFormat {
	///	Numerals followed by alphabets, numerals, or a decimal point (default, according to the original comment)
	NF_NUMERAL_FOLLOWED_BY_ALPHANUMERAL,
	///	C++ literal
	NF_CPLUSPLUS,
	///	Perl 5 literal
	NF_PERL,
	///	Ruby 1.8 literal
	NF_RUBY,
	///	VBScript 5.6 literal
	NF_VBSCRIPT,
	///	JavaScript 1.5 literal
	NF_JAVASCRIPT_15,
	///	JavaScript 2.0 literal
	NF_JAVASCRIPT_20,
	///	Number of formats (not a format itself; CLexer::SetNumberFormat rejects values >= NF_COUNT)
	NF_COUNT,
};

///	Restriction on where an annotation may begin
typedef unsigned char	AnnotationRestriction;
///	No restriction
const AnnotationRestriction	AR_NONE				= 0x00;
///	Only at the start of a line
const AnnotationRestriction	AR_ONLYSTARTOFLINE	= 0x01;
///	Only as the first token of a line other than white space
const AnnotationRestriction	AR_ONLYHEADOFLINE	= 0x02;

///	Event handler (listener) interface for the lexer
interface ILexerEventListener {
	///	Destructor
	virtual			~ILexerEventListener() {}
	/**
	 *	@brief	A keyword set or an annotation was added
	 *
	 *	According to the original comment, OnLexerChanged is also called after this.
	 *	@param type		kind of the token
	 *	@param nCookie	cookie of the token
	 */
	virtual	void	OnLexerAddedIdentifiedToken(TokenType type, TokenCookie nCookie) = 0;
	///	The lexing rules were changed
	virtual void	OnLexerChanged() = 0;
	///	All lexing rules were removed
	virtual void	OnLexerCleared() = 0;
	/**
	 *	@brief	A keyword set or an annotation was removed
	 *
	 *	According to the original comment, OnLexerChanged is also called after this.
	 *	@param type		kind of the token
	 *	@param nCookie	cookie of the token
	 */
	virtual	void	OnLexerRemovedIdentifiedToken(TokenType type, TokenCookie nCookie) = 0;
};

///	A token: the position where it starts plus its type and cookie.
///	The type and the cookie are packed together into one 16-bit field.
///	@see	CTokenLayout
class CToken : public Manah::Windows::CSmallObject<> {
	friend class CLexer;	// CLexer may call the private setters

	// Constructors
public:
	CToken();
	CToken(length_t i, TokenType type, TokenCookie nCookie);

	// Methods
public:
	TokenCookie	GetCookie() const;
	length_t	GetIndex() const;
	TokenType	GetType() const;
private:
	void	SetCookie(TokenCookie nCookie);
	void	SetType(TokenType type);

	// Data members
private:
	length_t		m_i;	// position within the line
	unsigned short	m_type;	// high 4 bits: kind (TokenType), low 12 bits: identifying value (TokenCookie)
};

///	Sequence of tokens (the return type of CLexer::Parse)
typedef std::list<CToken>					TokenList;
///	List of keyword strings
typedef std::list<string_t>					KeywordList;
///	Keyword lists keyed by their cookie (the return type of CLexer::GetKeywords)
typedef std::map<TokenCookie, KeywordList>	KeywordsMap;

/**
 *	@brief	Lexer (lexical analyzer)
 *
 *	NOTE(review): the following notes are translated from the original comments
 *	and cannot be verified from this header alone:
 *	- Case-insensitive mode (see IgnoreCase) applies only to keywords and to
 *	  single-line annotations terminated by a line break. All other tokens are
 *	  always matched case-sensitively.
 *	- The case folding used in case-insensitive mode does not follow Unicode 4.0.
 *	- Among the tokens registered through the AddXXXX methods, longer strings
 *	  presumably take priority.
 *
 *	Instances are not copyable: the copy constructor and the copy assignment
 *	operator are declared private.
 */
class CLexer : public Manah::CObject {
	// Constructors
public:
	CLexer(ILexerEventListener* pEventListener);
	virtual ~CLexer();

	// Methods (configuration)
public:
	TokenCookie	AddMultilineAnnotation(const string_t& strStartDelimiter,
					const string_t& strEndDelimiter, AnnotationRestriction ar = AR_NONE);
	TokenCookie	AddSinglelineAnnotation(const string_t& strStartDelimiter, AnnotationRestriction ar = AR_NONE);
	TokenCookie	AddSinglelineAnnotation(const string_t& strStartDelimiter,
					const string_t& strEndDelimiter, AnnotationRestriction ar = AR_NONE);
	TokenCookie	AddKeywords(const std::set<string_t>& setKeywords);
	void		EnableToken(TokenType type, bool bEnable);
	void		EnableUnicodeAlphabets(bool bEnable);
	void		EnableUnicodeWhiteSpaces(bool bEnable);
	void		Freeze();
	bool		GetBracketTraits(char_t chBracket, char_t& chPair, bool& bOpener) const;
	const KeywordsMap&	GetKeywords() const;
	void		IgnoreCase(bool bIgnore);
	void		RemoveAll();
	void		RemoveIdentifiedToken(TokenCookie nCookie) throw(std::invalid_argument);
	void		Reset();
	void		SetAdditionalAlphabets(const char_t* pwszAlphabets, length_t cch);
	void		SetAdditionalAlphabets(const std::set<Manah::Text::CodePoint>& setAlphabets);
	void		SetBrackets(const char_t* pwszBrackets)  throw(std::invalid_argument);
	void		SetNumberFormat(NumberFormat format) throw(std::invalid_argument);
	void		SetOperators(const std::set<string_t>& setOperators);
	void		Unfreeze();

	// Methods (parsing)
public:
	static bool		IsAsciiControl(const char_t* pwsz, length_t cch);
	static bool		IsDigitCodePoint(Manah::Text::CodePoint cp);
	length_t		IsIdentifier(const char_t* pwsz, length_t cch) const;
	bool			IsIdentifierContinueCodePoint(Manah::Text::CodePoint cp) const;
	bool			IsIdentifierStartCodePoint(Manah::Text::CodePoint cp) const;
	bool			IsKeyword(const string_t& str, TokenCookie& nCookie) const;
	length_t		IsOperator(const char_t* pwsz, length_t cch) const;
	static length_t	IsQuotation(const char_t* pwsz, length_t cch, char_t chOpen);
	length_t		IsMultilineAnnotation(const char_t* pwsz, length_t cch,
						AnnotationRestriction ar, TokenCookie& nCookie, bool& bContinued) const;
	static length_t	IsNumerals(const char_t* pwsz, length_t cch);
	length_t		IsNumber(const char_t* pwsz, length_t cch) const;
	length_t		IsSinglelineAnnotation(const char_t* pwsz, length_t cch,
						AnnotationRestriction ar, TokenCookie& nCookie) const;
	static bool		IsUnicodeControl(const char_t* pwsz, length_t cch);
	length_t		IsWhiteSpace(const char_t* pwsz, length_t cch, bool bIncludeTab) const;
	TokenList		Parse(const string_t& str, TokenCookie& nCookie) const;
	TokenCookie		ParseMultilineAnnotation(const string_t& str, TokenCookie nCookie) const;
private:
	// Private copy constructor: forbids copying.
	// NOTE(review): presumably never defined -- confirm in the implementation file.
	CLexer(const CLexer& rhs);
	void		NotifyChange();

	// Public member constants
public:
	static const char_t	m_wszDefaultOpeners[];
	static const char_t	m_wszUnicodeOpeners[];

	// Operators
private:
	// Private copy assignment: forbids assignment. The return type is spelled out
	// because a declaration without one ("implicit int") is ill-formed in standard C++.
	CLexer&	operator =(const CLexer& rhs);

	// Private data types
private:
	typedef std::list<string_t>	OperatorList;

	///	Single-line annotation terminated by a line break
	struct TSinglelineAnnotationEndedByBreak {
		string_t	strStartDelimiter;	///	start delimiter
		AnnotationRestriction	ar;		///	restriction
	};

	///	Single-line annotation terminated by a given delimiter
	struct TSinglelineAnnotationEndedByDelimiter {
		string_t	strStartDelimiter;	///	start delimiter
		string_t	strEndDelimiter;	///	end delimiter
		AnnotationRestriction	ar;		///	restriction (affects the start delimiter only)
	};

	///	Multi-line annotation
	struct TMultilineAnnotation {
		string_t	strStartDelimiter;	///	start delimiter
		string_t	strEndDelimiter;	///	end delimiter
		AnnotationRestriction	ar;		///	restriction (affects the start delimiter only)
	};

	typedef std::map<TokenCookie, TSinglelineAnnotationEndedByBreak>		SAnnotationBMap;
	typedef std::map<TokenCookie, TSinglelineAnnotationEndedByDelimiter>	SAnnotationDMap;
	typedef std::map<TokenCookie, TMultilineAnnotation>						MAnnotationMap;

	// Data members
private:
	bool				m_bFreezed;						// while true, NotifyChange does not call the listener
	bool				m_bIgnoreCase;					// set by IgnoreCase
	bool				m_bEnableUnicodeAlphabets;		// set by EnableUnicodeAlphabets
	bool				m_bEnableUnicodeWhiteSpaces;	// set by EnableUnicodeWhiteSpaces
	bool				m_arrEnabledTokenTypes[TT_COUNT];	// per-TokenType flags set by EnableToken
	NumberFormat		m_numberFormat;					// set by SetNumberFormat
	char_t*				m_pwszBrackets;
	static TokenCookie	m_nCookie;
	ILexerEventListener*				m_pEventListener;	// may be null; checked before every notification
	std::list<Manah::Text::CodePoint>	m_listAdditionalAlphabets;	// code points regarded as alphabets

	KeywordsMap						m_mapKeywords;		// keyword sets
	SAnnotationBMap					m_mapSAnnotationBs;	// single-line annotations (terminated by a line break)
	SAnnotationDMap					m_mapSAnnotationDs;	// single-line annotations (terminated by a delimiter)
	MAnnotationMap					m_mapMAnnotations;	// multi-line annotations
	std::map<char_t, OperatorList>	m_mapOperators;		// operators
};


// CToken class inline implementation
/////////////////////////////////////////////////////////////////////////////

///	Default constructor.
///	Initializes the token to index 0, type TT_WHITESPACE (0) and NullCookie (0)
///	so that the accessors never read indeterminate values.
inline CToken::CToken() : m_i(0), m_type(0) {
}

/**
 *	Constructor.
 *	@param i		position of the token within the line
 *	@param type		kind of the token (stored in the high 4 bits of m_type)
 *	@param nCookie	cookie of the token (stored in the low 12 bits of m_type)
 *
 *	Both values are masked to their bit fields, exactly as SetType and SetCookie do,
 *	so that a negative or oversized cookie cannot overwrite the type bits.
 */
inline CToken::CToken(length_t i, TokenType type, TokenCookie nCookie)
		: m_i(i), m_type(static_cast<unsigned short>(((type & 0x000F) << 12) | (nCookie & 0x0FFF))) {
}

///	Returns the cookie stored in the token.
///	(According to the original comment it is meaningful for annotations and keywords only.)
inline TokenCookie CToken::GetCookie() const {
	AssertValid();
	const unsigned short nCookieMask = 0x0FFF;	// the low 12 bits hold the cookie
	return m_type & nCookieMask;
}

///	Returns the position at which the token starts
inline length_t CToken::GetIndex() const {
	AssertValid();
	return this->m_i;
}

///	Returns the kind of the token
inline TokenType CToken::GetType() const {
	AssertValid();
	const unsigned short nTypeBits = m_type >> 12;	// the high 4 bits hold the TokenType
	return static_cast<TokenType>(nTypeBits);
}

///	Replaces the cookie (low 12 bits of m_type) and leaves the token type untouched.
///	Only the low 12 bits of @a nCookie are stored.
inline void CToken::SetCookie(TokenCookie nCookie) {
	AssertValid();
	m_type = static_cast<unsigned short>((m_type & 0xF000) | (nCookie & 0x0FFF));
}

///	Replaces the token type (high 4 bits of m_type) and leaves the cookie untouched.
///	Only the low 4 bits of @a type are stored.
inline void CToken::SetType(TokenType type) {
	AssertValid();
	m_type = static_cast<unsigned short>((m_type & 0x0FFF) | ((type & 0x000F) << 12));
}

// CLexer class inline implementation
/////////////////////////////////////////////////////////////////////////////

/*
 *	ENABLE_SWITCH(a, b): assigns b to a and calls NotifyChange(), but only when the
 *	two values differ. NotifyChange must be callable at the point of use.
 *
 *	Both arguments are parenthesized so that an expression argument such as
 *	`cond ? x : y` is compared and assigned as a whole. Note that b is evaluated
 *	twice when the values differ, so it must not have side effects.
 */
#define ENABLE_SWITCH(a, b)	\
	do {					\
		if((a) != (b)) {	\
			(a) = (b);		\
			NotifyChange();	\
		}					\
	} while(false)

/**
 *	Enables or disables a kind of token.
 *	The listener is notified (through NotifyChange) only when the setting actually changes.
 *	@param type		kind of the token; must be less than TT_COUNT. An out-of-range value
 *					trips an assertion in debug builds and is otherwise ignored, instead
 *					of writing outside the per-type table.
 *	@param bEnable	true to enable the token
 */
inline void CLexer::EnableToken(TokenType type, bool bEnable) {
	AssertValid();
	// The unsigned comparison rejects negative values together with values >= TT_COUNT.
	const bool bValidType = static_cast<unsigned int>(type) < static_cast<unsigned int>(TT_COUNT);
	assert(bValidType);
	if(!bValidType)
		return;
	ENABLE_SWITCH(m_arrEnabledTokenTypes[type], bEnable);
}

///	Unicode At@xbgAt@xbgƂĎgp邩ݒ肷
inline void CLexer::EnableUnicodeAlphabets(bool bEnable) {
	AssertValid();
	ENABLE_SWITCH(m_bEnableUnicodeAlphabets, bEnable);
}

///	Unicode 󔒗ޕƂĎgp邩ݒ肷
inline void CLexer::EnableUnicodeWhiteSpaces(bool bEnable) {
	AssertValid();
	ENABLE_SWITCH(m_bEnableUnicodeWhiteSpaces, bEnable);
}

///	ݒ肪ύXĂCxgnhɒʒmȂ悤ɂ
inline void CLexer::Freeze() {
	AssertValid();
	m_bFreezed = true;
}

///	Returns the registered keyword sets, keyed by cookie
inline const KeywordsMap& CLexer::GetKeywords() const {
	AssertValid();
	return this->m_mapKeywords;
}

/**
 *	啶ʂ邩ǂ̐ݒ
 *	@param bIgnore	ʂȂꍇ true
 */
inline void CLexer::IgnoreCase(bool bIgnore) {
	AssertValid();
	ENABLE_SWITCH(m_bIgnoreCase, bIgnore);
}

/**
 *	Tests whether the first character of a range is a control character.
 *	@param pwsz	pointer to the character to test (must not be null)
 *	@param cch	number of characters available at @a pwsz
 *	@return		true if @a cch is non-zero and iswcntrl accepts the first character
 *
 *	NOTE(review): iswcntrl is locale-dependent and not restricted to the ASCII range,
 *	so this may accept more than the name suggests -- confirm the intended behaviour.
 */
inline bool CLexer::IsAsciiControl(const char_t* pwsz, length_t cch) {
	assert(pwsz != 0);

	// An empty range has no character to classify.
	return cch != 0 && iswcntrl(*pwsz) != 0;
}

/**
 *	Tests whether a code point is a digit.
 *	@param cp	code point to test
 *	@return		true if @a cp is an ASCII digit or Manah::Text::ToAsciiDigit changes it
 *	@see		CLexer::IsNumerals
 */
inline bool CLexer::IsDigitCodePoint(Manah::Text::CodePoint cp) {
#if ASCENSION_UNICODE_VERSION != 0x0400
#error This code is based on old version of Unicode.
#endif
	// ASCII digits are accepted directly.
	if(cp >= L'0' && cp <= L'9')
		return true;
	// presumably ToAsciiDigit maps a non-ASCII digit to its ASCII counterpart and
	// returns any other code point unchanged -- TODO confirm
	return Manah::Text::ToAsciiDigit(cp) != cp;
}

/**
 *	Returns the length of a quoted string, including both quotation marks.
 *	A backslash escapes (skips) the character that follows it.
 *	@param pwsz		string to test (must not be null)
 *	@param cch		number of characters available at @a pwsz
 *	@param chOpen	opening quotation mark
 *	@return			length of the quotation; 0 if @a chOpen is not a supported opening
 *					mark, the string does not start with it, or @a cch is 0;
 *					@a cch if the quotation is not closed within the range
 */
inline length_t CLexer::IsQuotation(const char_t* pwsz, length_t cch, char_t chOpen) {
	assert(pwsz != 0);

	// An empty range cannot hold a quotation; return before pwsz[0] is read.
	if(cch == 0)
		return 0;

	char_t	chClose;

#if ASCENSION_UNICODE_VERSION != 0x0400
#error This code is based on old version of Unicode.
#endif
	// Map the opening mark to its closing mark.
	switch(chOpen) {
	case 0x0022:	chClose = 0x0022;	break;
	case 0x0027:	chClose = 0x0027;	break;
	case 0x00AB:	chClose = 0x00BB;	break;
	case 0x2018:	chClose = 0x2019;	break;
//	case 0x201A:	chClose = 0x????;	break;	// closing mark undecided
	case 0x201C:	chClose = 0x201D;	break;
//	case 0x201E:	chClose = 0x????;	break;	// closing mark undecided
//	case 0x201F:	chClose = 0x????;	break;	// closing mark undecided
	case 0x2039:	chClose = 0x203A;	break;
	case 0x300C:	chClose = 0x300D;	break;
	case 0x300E:	chClose = 0x300F;	break;
//	case 0x301D:	chClose = 0x301E or 0x301F;	break;	// ambiguous closing mark
	case 0xFE41:	chClose = 0xFE42;	break;
	case 0xFE43:	chClose = 0xFE44;	break;
	case 0xFF62:	chClose = 0xFF63;	break;
	default:		return 0;	// not a supported opening mark
	}

	if(pwsz[0] != chOpen)
		return 0;
	for(length_t i = 1; i < cch; ++i) {
		if(pwsz[i] == L'\\')	// the character after a backslash is ignored
			++i;
		else if(pwsz[i] == chClose)
			return i + 1;
	}
	return cch;	// not closed within the range
}

/**
 *	Returns the length of the run of digits at the beginning of a string.
 *	A well-formed UTF-16 surrogate pair is decoded and tested as one code point.
 *	@param pwsz	string to test (must not be null)
 *	@param cch	number of characters available at @a pwsz
 *	@return		length of the leading digit run in UTF-16 code units; 0 if the string
 *				does not start with a digit; @a cch if every character is a digit
 *	@see		CLexer::IsDigitCodePoint
 */
inline length_t CLexer::IsNumerals(const char_t* pwsz, length_t cch) {
	assert(pwsz != 0);

	using namespace Manah::Text;

	length_t	cchDigits = 0;
	while(cchDigits < cch) {
		// Take a single code unit unless a complete surrogate pair starts here.
		CodePoint	cp = pwsz[cchDigits];
		if(IsUTF16HighSurrogate(pwsz[cchDigits])
				&& cchDigits < cch - 1
				&& IsUTF16LowSurrogate(pwsz[cchDigits + 1]))
			cp = DecodeUTF16SurrogatePairToCodePoint(pwsz + cchDigits, cch - cchDigits);
		if(!CLexer::IsDigitCodePoint(cp))
			return cchDigits;
		// A supplementary code point occupies two code units.
		cchDigits += (cp >= 0x010000) ? 2 : 1;
	}
	return cch;
}

///	Notifies the listener of a change, unless the lexer is frozen or has no listener
inline void CLexer::NotifyChange() {
	AssertValid();
	if(m_bFreezed || m_pEventListener == 0)
		return;
	m_pEventListener->OnLexerChanged();
}

/**
 *	Sets the number format used for lexing.
 *	The listener is notified (through NotifyChange) only when the format actually changes.
 *	@param format	new number format; must be one of the NumberFormat values below NF_COUNT
 *	@throw std::invalid_argument	@a format is out of range (negative, or NF_COUNT and above)
 */
inline void CLexer::SetNumberFormat(NumberFormat format) throw(std::invalid_argument) {
	AssertValid();
	// The unsigned comparison rejects negative values together with values >= NF_COUNT.
	if(static_cast<unsigned int>(format) >= static_cast<unsigned int>(NF_COUNT))
		throw std::invalid_argument("");
	ENABLE_SWITCH(m_numberFormat, format);
}

///	CLexer::Freeze ɂ铀ACxgnhɒʒm
inline void CLexer::Unfreeze() {
	AssertValid();
	ENABLE_SWITCH(m_bFreezed, false);
}

#undef ENABLE_SWITCH

} // namespace Ascension

#endif /* _LEXER_H_ */

/* [EOF] */