// Lexer.cpp
// (c) 2004 exeal

#include "StdAfx.h"
#include "Lexer.h"
#include "TextSearcher.h"	// CBoundarySearcher::IsFirstCharacterOfCluster
#include <algorithm>		// std::binary_search

using namespace Ascension;
using namespace std;
using namespace Manah::Text;


namespace {
	/// Ordering predicate for keyword lists: true when @a str1 compares less than @a str2 with ::StrCmpW (case-sensitive).
	bool gCompareKeyword(const string_t& str1, const string_t& str2) {
		const int	nOrder = ::StrCmpW(str1.c_str(), str2.c_str());
		return nOrder < 0;
	}
	/// Ordering predicate for keyword lists: true when @a str1 compares less than @a str2 with ::StrCmpIW (case-insensitive).
	bool gCompareKeywordNoCase(const string_t& str1, const string_t& str2) {
		const int	nOrder = ::StrCmpIW(str1.c_str(), str2.c_str());
		return nOrder < 0;
	}
	/// Ordering predicate that places longer strings first (true when @a str1 is longer than @a str2).
	bool gCompareLength(const string_t& str1, const string_t& str2) {
		return str2.length() < str1.length();
	}
	// Table of (opener, closer) bracket pairs. CLexer::SetBrackets() scans it linearly to
	// validate an opener and to look up the matching closer. The entry whose first member
	// is 0 marks the end of the table.
	pair<char_t, char_t>	garrBracketPairs[] = {	// Unicode 4.0 general categories Ps (open) and Pe (close)
		make_pair(0x0028, 0x0029),	// Parenthesis
		make_pair(0x005B, 0x005D),	// Square Bracket
		make_pair(0x007B, 0x007D),	// Curly Bracket
		make_pair(0x0F3A, 0x0F3B),	// Tibetan Mark Gug Rtags Gyon and Gyas
		make_pair(0x0F3C, 0x0F3D),	// Tibetan Mark Ang Khang Gyon and Gyas
		make_pair(0x169B, 0x169C),	// Ogham Feather Mark and reversed one
//		make_pair(0x201A, 0x????),	// Single Low-9 Quotation Mark
//		make_pair(0x201E, 0x????),	// Double Low-9 Quotation Mark
		make_pair(0x2045, 0x2046),	// Square Bracket With Quill
		make_pair(0x207D, 0x207E),	// Superscript Parenthesis
		make_pair(0x208D, 0x208E),	// Subscript Parenthesis
		make_pair(0x2329, 0x232A),	// Pointing Angle Bracket
		make_pair(0x23B4, 0x23B5),	// Square Bracket (top/bottom)
		make_pair(0x2768, 0x2769),	// Medium Parenthesis Ornament
		make_pair(0x276A, 0x276B),	// Medium Flattened Parenthesis Ornament
		make_pair(0x276C, 0x276D),	// Medium Pointing Angle Bracket Ornament
		make_pair(0x276E, 0x276F),	// Heavy Pointing Angle Quotation Mark Ornament
		make_pair(0x2770, 0x2771),	// Heavy Pointing Angle Bracket Ornament
		make_pair(0x2772, 0x2773),	// Light Tortoise Shell Bracket Ornament
		make_pair(0x2774, 0x2775),	// Medium Curly Bracket Ornament
		make_pair(0x27E6, 0x27E7),	// Mathematical White Square Bracket
		make_pair(0x27E8, 0x27E9),	// Mathematical Angle Bracket
		make_pair(0x27EA, 0x27EB),	// Mathematical Double Angle Bracket
		make_pair(0x2983, 0x2984),	// White Curly Bracket
		make_pair(0x2985, 0x2986),	// White Parenthesis
		make_pair(0x2987, 0x2988),	// Z Notation Image Bracket
		make_pair(0x2989, 0x298A),	// Z Notation Binding Bracket
		make_pair(0x298B, 0x298C),	// Square Bracket With Underbar
		make_pair(0x298D, 0x298E),	// Left Square Bracket With Tick In Top Corner and Right ... Bottom
		make_pair(0x298F, 0x2990),	// Left Square Bracket With Tick In Bottom Corner and Right ... Top
		make_pair(0x2991, 0x2992),	// Angle Bracket With Dot
		make_pair(0x2993, 0x2994),	// Arc Less-Than Bracket
		make_pair(0x2995, 0x2996),	// Double Arc Greater-Than Bracket
		make_pair(0x2997, 0x2998),	// Black Tortoise Shell Bracket
		make_pair(0x29D8, 0x29D9),	// Wiggly Fence
		make_pair(0x29DA, 0x29DB),	// Double Wiggly Fence
		make_pair(0x29FC, 0x29FD),	// Pointing Curved Angle Bracket
		make_pair(0x3008, 0x3009),	// Angle Bracket
		make_pair(0x300A, 0x300B),	// Double Angle Bracket
		make_pair(0x300C, 0x300D),	// Corner Bracket
		make_pair(0x300E, 0x300F),	// White Corner Bracket
		make_pair(0x3010, 0x3011),	// Black Lenticular Bracket
		make_pair(0x3014, 0x3015),	// Tortoise Shell Bracket
		make_pair(0x3016, 0x3017),	// White Lenticular Bracket
		make_pair(0x3018, 0x3019),	// White Tortoise Shell Bracket
		make_pair(0x301A, 0x301B),	// White Square Bracket
		make_pair(0x301D, 0x301F),	// Double Prime Quotation Mark and reversed one
//		make_pair(0x????, 0x301E),	// Double Prime Quotation Mark (deprecated: mistaken analogue)
		make_pair(0xFD3E, 0xFD3F),	// Ornate Parenthesis
		make_pair(0xFE35, 0xFE36),	// Presentation Form For Vertical Parenthesis
		make_pair(0xFE37, 0xFE38),	// - Curly Bracket
		make_pair(0xFE39, 0xFE3A),	// - Tortoise Shell Bracket
		make_pair(0xFE3B, 0xFE3C),	// - Black Lenticular Bracket
		make_pair(0xFE3D, 0xFE3E),	// - Double Angle Bracket
		make_pair(0xFE3F, 0xFE40),	// - Angle Bracket
		make_pair(0xFE41, 0xFE42),	// - Corner Bracket
		make_pair(0xFE43, 0xFE44),	// - White Corner Bracket
		make_pair(0xFE45, 0xFE46),	// Sesame Dot and White one
		make_pair(0xFE47, 0xFE48),	// - Square Bracket
		make_pair(0xFE59, 0xFE5A),	// Small Parenthesis
		make_pair(0xFE5B, 0xFE5C),	// Small Curly Bracket
		make_pair(0xFE5D, 0xFE5E),	// Small Tortoise Shell Bracket
		make_pair(0xFF08, 0xFF09),	// Fullwidth Parenthesis
		make_pair(0xFF3B, 0xFF3D),	// Fullwidth Square Bracket
		make_pair(0xFF5B, 0xFF5D),	// Fullwidth Curly Bracket
		make_pair(0xFF5F, 0xFF60),	// Fullwidth White Parenthesis
		make_pair(0xFF62, 0xFF63),	// Halfwidth Corner Bracket
		make_pair(0x0000, 0x0000)	// terminator: first == 0 ends the table
	};
}


// CLexer class implementation
/////////////////////////////////////////////////////////////////////////////

// Opening brackets of the default (ASCII) bracket set.
const char_t	CLexer::m_wszDefaultOpeners[] = L"([{";
// Opening brackets of every pair listed in garrBracketPairs (same order as that table).
const char_t	CLexer::m_wszUnicodeOpeners[] = L"([{\x0F3A\xF3C\x169B\x2045\x207D\x208D\x2329\x23B4"
												L"\x2768\x276A\x276C\x276E\x2770\x2772\x2774\x27E6"
												L"\x27E8\x27EA\x2983\x2985\x2987\x2989\x298B\x298D"
												L"\x298F\x2991\x2993\x2995\x2997\x29D8\x29DA\x29FC"
												L"\x3008\x300A\x300C\x300E\x3010\x3014\x3016\x3018"
												L"\x301A\x301D\xFD3E\xFE35\xFE37\xFE39\xFE3B\xFE3D"
												L"\xFE3F\xFE41\xFE43\xFE45\xFE47\xFE59\xFE5B\xFE5D"
												L"\xFF08\xFF3B\xFF5B\xFF5F\xFF62";
// Next cookie value handed out by the AddXXXX methods; shared by all lexers and
// incremented after each registration. Starts just above NullCookie.
TokenCookie		CLexer::m_nCookie = NullCookie + 1;

/**
 *	Constructor. Enables every token type, selects the default number format and
 *	installs the default bracket pairs "()", "[]" and "{}".
 *	@param pEventListener	listener notified when identified tokens are added or removed (may be null)
 */
CLexer::CLexer(ILexerEventListener* pEventListener) :
		m_bFreezed(false), m_bIgnoreCase(false),
		m_bEnableUnicodeAlphabets(true), m_bEnableUnicodeWhiteSpaces(true),
		m_numberFormat(NF_NUMERAL_FOLLOWED_BY_ALPHANUMERAL),
		m_pEventListener(pEventListener) {
	for(size_t i = 0; i < TT_COUNT; ++i)
		m_arrEnabledTokenTypes[i] = true;

	// m_pwszBrackets stores opener/closer PAIRS (opener at even index, its closer right
	// after it) -- that is the layout GetBracketTraits() reads and SetBrackets() writes.
	// Copying only the openers "([{" would make '(' pair with '['.
	static const char_t	wszDefaultPairs[] = L"()[]{}";
	m_pwszBrackets = new char_t[sizeof(wszDefaultPairs) / sizeof(wszDefaultPairs[0])];
	wcscpy(m_pwszBrackets, wszDefaultPairs);
}

///	Destructor. Frees the bracket pair buffer allocated by the constructor or SetBrackets().
CLexer::~CLexer() {
	delete[] m_pwszBrackets;
}

/**
 *	Registers a group of keywords under a newly issued cookie.
 *	The keywords are stored sorted with the comparison selected by the current
 *	case-sensitivity setting, the listener (if any) is told about the new
 *	TT_KEYWORD cookie, and NotifyChange() is called.
 *	@param setKeywords	keywords to register
 *	@return				cookie identifying the registered group
 */
TokenCookie CLexer::AddKeywords(const set<string_t>& setKeywords) {
	AssertValid();

	// Copy into a list so it can be ordered with the lexer's own comparison.
	KeywordList	listSorted;
	set<string_t>::const_iterator	itWord = setKeywords.begin();
	while(itWord != setKeywords.end()) {
		listSorted.push_back(*itWord);
		++itWord;
	}
	if(m_bIgnoreCase)
		listSorted.sort(gCompareKeywordNoCase);
	else
		listSorted.sort(gCompareKeyword);

	m_mapKeywords.insert(
		map<TokenCookie, KeywordList>::value_type(CLexer::m_nCookie, listSorted));
	if(m_pEventListener != 0)
		m_pEventListener->OnLexerAddedIdentifiedToken(TT_KEYWORD, CLexer::m_nCookie);
	NotifyChange();

	// Hand out the current cookie and advance the shared counter.
	return CLexer::m_nCookie++;
}

/**
 *	Registers a multi-line annotation (an annotation that may span several lines)
 *	under a newly issued cookie.
 *	@param strStartDelimiter	delimiter that opens the annotation
 *	@param strEndDelimiter		delimiter that closes the annotation
 *	@param ar					restriction on where the start delimiter may appear
 *	@return						cookie identifying the registered annotation
 */
TokenCookie CLexer::AddMultilineAnnotation(const string_t& strStartDelimiter,
		const string_t& strEndDelimiter, AnnotationRestriction ar /* = AR_NONE */) {
	AssertValid();

	const TMultilineAnnotation	definition = {strStartDelimiter, strEndDelimiter, ar};
	m_mapMAnnotations.insert(MAnnotationMap::value_type(CLexer::m_nCookie, definition));

	// Tell the listener (when there is one) about the new annotation cookie.
	if(m_pEventListener != 0)
		m_pEventListener->OnLexerAddedIdentifiedToken(TT_ANNOTATION, CLexer::m_nCookie);
	NotifyChange();

	// Hand out the current cookie and advance the shared counter.
	return CLexer::m_nCookie++;
}

/**
 *	Registers a single-line annotation that runs to the end of the line,
 *	under a newly issued cookie.
 *	@param strStartDelimiter	delimiter that opens the annotation
 *	@param ar					restriction on where the start delimiter may appear
 *	@return						cookie identifying the registered annotation
 */
TokenCookie CLexer::AddSinglelineAnnotation(
		const string_t& strStartDelimiter, AnnotationRestriction ar /* = AR_NONE */) {
	AssertValid();

	const TSinglelineAnnotationEndedByBreak	definition = {strStartDelimiter, ar};
	m_mapSAnnotationBs.insert(SAnnotationBMap::value_type(CLexer::m_nCookie, definition));

	// Tell the listener (when there is one) about the new annotation cookie.
	if(m_pEventListener != 0)
		m_pEventListener->OnLexerAddedIdentifiedToken(TT_ANNOTATION, CLexer::m_nCookie);
	NotifyChange();

	// Hand out the current cookie and advance the shared counter.
	return CLexer::m_nCookie++;
}

/**
 *	Registers a single-line annotation that is closed by an explicit end delimiter,
 *	under a newly issued cookie.
 *	@param strStartDelimiter	delimiter that opens the annotation
 *	@param strEndDelimiter		delimiter that closes the annotation
 *	@param ar					restriction on where the start delimiter may appear
 *	@return						cookie identifying the registered annotation
 */
TokenCookie CLexer::AddSinglelineAnnotation(const string_t& strStartDelimiter,
		const string_t& strEndDelimiter, AnnotationRestriction ar /* = AR_NONE */) {
	AssertValid();

	const TSinglelineAnnotationEndedByDelimiter	definition = {
		strStartDelimiter, strEndDelimiter, ar
	};
	m_mapSAnnotationDs.insert(SAnnotationDMap::value_type(CLexer::m_nCookie, definition));

	// Tell the listener (when there is one) about the new annotation cookie.
	if(m_pEventListener != 0)
		m_pEventListener->OnLexerAddedIdentifiedToken(TT_ANNOTATION, CLexer::m_nCookie);
	NotifyChange();

	// Hand out the current cookie and advance the shared counter.
	return CLexer::m_nCookie++;
}

/**
 *	Looks up a character in the configured bracket pairs.
 *	The pair buffer stores each opener at an even index, immediately followed by its closer.
 *	@param chBracket	character to look up
 *	@param chPair		[out] the matching bracket (written only when the method returns true)
 *	@param bOpener		[out] true if <var>chBracket</var> is an opening bracket
 *						(written only when the method returns true)
 *	@return				true if <var>chBracket</var> is one of the configured brackets
 */
bool CLexer::GetBracketTraits(char_t chBracket, char_t& chPair, bool& bOpener) const {
	// wcschr() also "finds" the terminating NUL; without this guard a NUL argument
	// would be reported as a bracket and chPair read from outside the string.
	if(chBracket == 0)
		return false;

	const char_t* const	pwszFound = wcschr(m_pwszBrackets, chBracket);
	if(pwszFound == 0)
		return false;

	// Even offset -> opener (closer follows); odd offset -> closer (opener precedes).
	bOpener = ((pwszFound - m_pwszBrackets) % 2 == 0);
	chPair = bOpener ? pwszFound[1] : pwszFound[-1];
	return true;
}

/**
 *	Measures the identifier at the beginning of a string.
 *	@param pwsz	text to examine
 *	@param cch	number of UTF-16 code units in <var>pwsz</var>
 *	@return		length (in code units) of the leading run of identifier characters,
 *				or 0 if the text does not begin with one
 *	@see		CLexer::IsIdentifierContinueCodePoint, CLexer::IsIdentifierStartCodePoint
 */
length_t CLexer::IsIdentifier(const char_t* pwsz, length_t cch) const {
	AssertValid();
	assert(pwsz != 0);

	length_t	iPos = 0;
	while(iPos < cch) {
		// Decode a surrogate pair as one code point; anything else is taken as is.
		CodePoint	cp;
		if(IsUTF16HighSurrogate(pwsz[iPos])
				&& iPos < cch - 1
				&& IsUTF16LowSurrogate(pwsz[iPos + 1]))
			cp = DecodeUTF16SurrogatePairToCodePoint(pwsz + iPos, cch - iPos);
		else
			cp = pwsz[iPos];

		// NOTE(review): the "continue" test is also applied at position 0, so a
		// continue-only character (e.g. '_') is accepted as the first character.
		// Kept as is; confirm whether that is intended.
		const bool	bAccepted = (iPos == 0 && IsIdentifierStartCodePoint(cp))
			|| IsIdentifierContinueCodePoint(cp);
		if(!bAccepted)
			return iPos;

		// Supplementary code points occupy two code units.
		iPos += (cp >= 0x010000) ? 2 : 1;
	}
	return cch;
}

/**
 *	Tests whether a code point may appear inside an identifier (after its first character).
 *	With Unicode alphabets enabled the test uses a table built for the Unicode ID_Continue
 *	property; otherwise only '_' and ASCII digits are added to the start characters.
 *	@param cp	code point to test
 *	@return		true if <var>cp</var> can continue an identifier
 *	@see		CLexer::IsIdentifier, CLexer::IsIdentifierStartCodePoint
 */
bool CLexer::IsIdentifierContinueCodePoint(CodePoint cp) const {
	AssertValid();

	// Code points whose general category is Mn (Mark, Non-Spacing), Mc (Mark, Spacing
	// Combining), Nd (Number, Decimal) or Pc (Punctuation, Connector).
	// Generated by unicat.pl (Unicode 4.0).
	static const CodePoint	arrIDContinue[] = {
#if(ASCENSION_UNICODE_VERSION != 0x0400)
#error Included file version differs from Ascension Unicode version. Update correspoding file.
#endif
#include "script\Lexer_IdentifierContinue_4_0"
	};

	// Every start character is also a continue character.
	if(IsIdentifierStartCodePoint(cp))
		return true;

	if(!m_bEnableUnicodeAlphabets)
		return (cp == L'_') || (cp >= L'0' && cp <= L'9');

	const CodePoint* const	pTableEnd = arrIDContinue + sizeof(arrIDContinue) / sizeof(CodePoint);
	return binary_search(arrIDContinue, pTableEnd, cp);
}

/**
 *	Tests whether a code point may begin an identifier.
 *	Additional alphabets registered with SetAdditionalAlphabets() are always accepted.
 *	With Unicode alphabets enabled the test uses a table built for the Unicode ID_Start
 *	property plus several large ranges; otherwise only ASCII letters are accepted.
 *	@param cp	code point to test
 *	@return		true if <var>cp</var> can start an identifier
 *	@see		CLexer::IsIdentifier, CLexer::IsIdentifierContinueCodePoint
 */
bool CLexer::IsIdentifierStartCodePoint(CodePoint cp) const {
	AssertValid();

	// Code points having the ID_Start property in DerivedCoreProperties.txt.
	// Generated by idstart.pl (Unicode 4.0).
	static const CodePoint	arrIDStart[] = {
#if(ASCENSION_UNICODE_VERSION != 0x0400)
#error Included file version differs from Ascension Unicode version. Update correspoding file.
#endif
#include "script\Lexer_IdentifierStart_4_0"
	};

	// User-supplied alphabets take precedence over every other rule.
	if(binary_search(m_listAdditionalAlphabets.begin(), m_listAdditionalAlphabets.end(), cp))
		return true;

	if(!m_bEnableUnicodeAlphabets)
		return (cp >= L'A' && cp <= 'Z') || (cp >= 'a' && cp <= 'z');

	const CodePoint* const	pTableEnd = arrIDStart + sizeof(arrIDStart) / sizeof(CodePoint);
	if(binary_search(arrIDStart, pTableEnd, cp))
		return true;

	// Large contiguous ranges are tested directly.
	return (cp >= 0x1401 && cp <= 0x166C)		// Canadian Syllabics
		|| (cp >= 0x3400 && cp <= 0x4DB5)		// CJK Unified Ideograph
		|| (cp >= 0x4E00 && cp <= 0x9FA5)		// CJK Unified Ideograph
		|| (cp >= 0xA000 && cp <= 0xA48C)		// Yi Syllable
		|| (cp >= 0xAC00 && cp <= 0xD7A3)		// Hangul Syllable
		|| (cp >= 0x20000 && cp <= 0x2A6D6)		// CJK Unified Ideograph
		|| (cp >= 0x2F800 && cp <= 0x2FA1D);	// CJK Compatibility Ideograph
}

/**
 *	L[[h̒Ԃ
 *	@param str		ׂ镶
 *	@param nCookie	[out] L[[hɊ֘AtꂽNbL[l
 *	@return			L[[hłꍇ true
 */
bool CLexer::IsKeyword(const string_t& str, TokenCookie& nCookie) const {
	AssertValid();
	assert(!str.empty());

	map<TokenCookie, KeywordList>::const_iterator	it;
	for(it = m_mapKeywords.begin(); it != m_mapKeywords.end(); ++it) {
		if(binary_search(it->second.begin(), it->second.end(), str,
				m_bIgnoreCase ? gCompareKeywordNoCase : gCompareKeyword)) {
			nCookie = it->first;
			return true;
		}
	}
	return false;
}

/**
 *	Measures the multi-line annotation at the beginning of a string
 *	(the returned length includes the start and end delimiters that were consumed).
 *	@param pwsz			text to examine
 *	@param cch			number of characters in <var>pwsz</var>
 *	@param ar			restrictions satisfied by the position of the text; only annotations
 *						whose own restrictions are all contained in this value are considered
 *	@param nCookie		[in, out] on input, the cookie of the multi-line annotation that is
 *						already open before the text, or NullCookie if none. When NullCookie is
 *						passed and the text starts an annotation, receives that annotation's cookie
 *	@param bContinued	[out] true if the annotation is not closed within the text
 *	@return				length of the annotation part, or 0 if the text is not an annotation
 *	@exception invalid_argument	thrown when <var>nCookie</var> is not a registered
 *								multi-line annotation cookie
 */
length_t CLexer::IsMultilineAnnotation(const char_t* pwsz, length_t cch,
		AnnotationRestriction ar, TokenCookie& nCookie, bool& bContinued) const throw(invalid_argument) {
	AssertValid();
	assert(pwsz != 0);

	length_t	iSearchFrom = 0;	// where the search for the end delimiter begins

	bContinued = false;
	if(nCookie == NullCookie) {
		// Not inside an annotation yet: look for a start delimiter at the very beginning.
		MAnnotationMap::const_iterator	itStart = m_mapMAnnotations.begin();
		for(; itStart != m_mapMAnnotations.end(); ++itStart) {
			const bool	bAllowedHere = ((itStart->second.ar & ar) == itStart->second.ar);
			if(!bAllowedHere || itStart->second.strStartDelimiter.length() > cch)
				continue;
			if(wcsncmp(pwsz,
					itStart->second.strStartDelimiter.c_str(),
					itStart->second.strStartDelimiter.length()) != 0)
				continue;
			nCookie = itStart->first;
			iSearchFrom = itStart->second.strStartDelimiter.length();
			break;
		}
		if(nCookie == NullCookie)
			return 0;
	}
	bContinued = true;

	// Look for the end delimiter of the annotation identified by nCookie.
	MAnnotationMap::const_iterator	itActive = m_mapMAnnotations.find(nCookie);
	if(itActive == m_mapMAnnotations.end())
		throw invalid_argument("Input cookie value is invalid.");

	const length_t	iEnd = string_t(pwsz, cch).find(itActive->second.strEndDelimiter, iSearchFrom);
	if(iEnd == string_t::npos)
		return cch;	// still open at the end of the text

	bContinued = false;
	return iEnd + itActive->second.strEndDelimiter.length();
}

/**
 *	Measures the number at the beginning of a string.
 *	A number must begin with a digit (per iswdigit).
 *	@param pwsz	text to examine
 *	@param cch	number of UTF-16 code units in <var>pwsz</var>
 *	@return		length of the number, or 0 if the text does not begin with a number
 */
length_t CLexer::IsNumber(const char_t* pwsz, length_t cch) const {
	AssertValid();
	assert(pwsz != 0);

	if(cch == 0 || !toBoolean(iswdigit(pwsz[0])))
		return 0;

	// Only NF_NUMERAL_FOLLOWED_BY_ALPHANUMERAL is implemented. The other formats
	// (NF_CPLUSPLUS, NF_PERL, NF_RUBY, NF_VBSCRIPT, NF_JAVASCRIPT_15, NF_JAVASCRIPT_20)
	// have no scanning rules yet and never match.
	if(m_numberFormat != NF_NUMERAL_FOLLOWED_BY_ALPHANUMERAL)
		return 0;

	// A digit followed by any run of '.' and identifier-continue characters.
	length_t	iPos = 1;
	while(iPos < cch) {
		CodePoint	cp;
		if(iPos < cch - 1
				&& IsUTF16HighSurrogate(pwsz[iPos])
				&& IsUTF16LowSurrogate(pwsz[iPos + 1]))
			cp = DecodeUTF16SurrogatePairToCodePoint(pwsz + iPos, cch - iPos);
		else
			cp = pwsz[iPos];

		if(pwsz[iPos] != L'.' && !IsIdentifierContinueCodePoint(cp))
			return iPos;

		// Supplementary code points occupy two code units.
		iPos += (cp > 0xFFFF) ? 2 : 1;
	}
	return cch;
}

/**
 *	Measures the operator at the beginning of a string.
 *	Operators are grouped by their first character; the candidates of a group are
 *	tried in stored order and the first one that matches wins.
 *	@param pwsz	text to examine
 *	@param cch	number of characters in <var>pwsz</var>
 *	@return		length of the operator, or 0 if the text does not begin with an operator
 */
length_t CLexer::IsOperator(const char_t* pwsz, length_t cch) const {
	AssertValid();
	assert(pwsz != 0);

	if(cch == 0)
		return 0;

	// Find the operators that share the first character of the text.
	map<char_t, OperatorList>::const_iterator	itGroup = m_mapOperators.find(pwsz[0]);
	if(itGroup == m_mapOperators.end())
		return 0;

	const OperatorList&	listCandidates = itGroup->second;
	OperatorList::const_iterator	itOp = listCandidates.begin();
	for(; itOp != listCandidates.end(); ++itOp) {
		if(itOp->length() <= cch
				&& wcsncmp(pwsz, itOp->c_str(), itOp->length()) == 0)
			return itOp->length();
	}
	return 0;
}

/**
 *	Measures the single-line annotation at the beginning of a string.
 *	Annotations ended by a line break are tried first (honouring the case-sensitivity
 *	setting), then annotations ended by a delimiter (always compared case-sensitively).
 *	@param pwsz		text to examine
 *	@param cch		number of characters in <var>pwsz</var>
 *	@param ar		restrictions satisfied by the position of the text; only annotations
 *					whose own restrictions are all contained in this value are considered
 *	@param nCookie	[out] cookie of the matched annotation (written only on a match)
 *	@return			length of the annotation, or 0 if the text does not begin with one.
 *					An annotation whose end delimiter is not found within the first
 *					<var>cch</var> characters extends to the end of the text
 */
length_t CLexer::IsSinglelineAnnotation(const char_t* pwsz,
		length_t cch, AnnotationRestriction ar, TokenCookie& nCookie) const {
	AssertValid();
	assert(pwsz != 0);

	// 1) Annotations terminated by the line break: a match consumes the whole text.
	for(SAnnotationBMap::const_iterator it =
			m_mapSAnnotationBs.begin(); it != m_mapSAnnotationBs.end(); ++it) {
		if(((it->second.ar & ar) != it->second.ar)
				|| it->second.strStartDelimiter.length() > cch)
			continue;
		const int	nDifference = m_bIgnoreCase ?
			::StrCmpNIW(pwsz, it->second.strStartDelimiter.c_str(),
				it->second.strStartDelimiter.length())
			: ::StrCmpNW(pwsz, it->second.strStartDelimiter.c_str(),
				it->second.strStartDelimiter.length());
		if(nDifference == 0) {
			nCookie = it->first;
			return cch;
		}
	}

	// 2) Annotations terminated by an explicit end delimiter.
	SAnnotationDMap::const_iterator	itD;
	for(itD = m_mapSAnnotationDs.begin(); itD != m_mapSAnnotationDs.end(); ++itD) {
		if(((itD->second.ar & ar) != itD->second.ar)
				|| itD->second.strStartDelimiter.length() > cch)
			continue;
		else if(::StrCmpNW(pwsz,
				itD->second.strStartDelimiter.c_str(),
				itD->second.strStartDelimiter.length()) == 0) {
			nCookie = itD->first;
			break;
		}
	}
	if(itD == m_mapSAnnotationDs.end())
		return 0;

	// Search for the end delimiter only inside [pwsz + start length, pwsz + cch).
	// A NUL-terminated search (StrStrW) would ignore cch and could read, and match,
	// beyond the range the caller asked about.
	const length_t	cchEndDelimiter = itD->second.strEndDelimiter.length();
	if(cchEndDelimiter == 0)
		return cch;	// an empty end delimiter never matches: annotation runs to the end
	const char_t* const	pwszEndDelimiter = itD->second.strEndDelimiter.c_str();
	const char_t* const	pwszLimit = pwsz + cch;
	const char_t* const	pwszEnd = search(
		pwsz + itD->second.strStartDelimiter.length(), pwszLimit,
		pwszEndDelimiter, pwszEndDelimiter + cchEndDelimiter);
	if(pwszEnd == pwszLimit)
		return cch;
	return static_cast<length_t>(pwszEnd - pwsz) + cchEndDelimiter;
}

/**
 *	Unicode 䕶ǂԂ
 *	@param pwsz	ׂ镶ւ̃|C^
 *	@param cch	
 *	@return		Unicode 䕶̏ꍇ true Ԃ
 */
bool CLexer::IsUnicodeControl(const char_t* pwsz, length_t cch) {
	assert(pwsz != 0);

#if ASCENSION_UNICODE_VERSION != 0x0400
#error This code is based on old version of Unicode.
#endif
	// ʕނ Cc (Other, Control) ACf (Other, Format) łR[h|Cg
	// unicat.pl 莩 (Unicode 4.0)
	static const CodePoint	arrCc[] = {
		0x0000, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007,
		0x0008, 0x0009, 0x000A, 0x000B, 0x000C, 0x000D, 0x000E, 0x000F,
		0x0010, 0x0011, 0x0012, 0x0013, 0x0014, 0x0015, 0x0016, 0x0017,
		0x0018, 0x0019, 0x001A, 0x001B, 0x001C, 0x001D, 0x001E, 0x001F,
		0x007F, 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086,
		0x0087, 0x0088, 0x0089, 0x008A, 0x008B, 0x008C, 0x008D, 0x008E,
		0x008F, 0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096,
		0x0097, 0x0098, 0x0099, 0x009A, 0x009B, 0x009C, 0x009D, 0x009E,
		0x009F,
	};
	static const CodePoint	arrCf[] = {
		0x00AD, 0x0600, 0x0601, 0x0602, 0x0603, 0x06DD, 0x070F, 0x17B4,
		0x17B5, 0x200C, 0x200D, 0x200E, 0x200F, 0x202A, 0x202B, 0x202C,
		0x202D, 0x202E, 0x2060, 0x2061, 0x2062, 0x2063, 0x206A, 0x206B,
		0x206C, 0x206D, 0x206E, 0x206F, 0xFEFF, 0xFFF9, 0xFFFA, 0xFFFB,
		0x1D173, 0x1D174, 0x1D175, 0x1D176, 0x1D177, 0x1D178, 0x1D179, 0x1D17A,
		0xE0001, 0xE0020, 0xE0021, 0xE0022, 0xE0023, 0xE0024, 0xE0025, 0xE0026,
		0xE0027, 0xE0028, 0xE0029, 0xE002A, 0xE002B, 0xE002C, 0xE002D, 0xE002E,
		0xE002F, 0xE0030, 0xE0031, 0xE0032, 0xE0033, 0xE0034, 0xE0035, 0xE0036,
		0xE0037, 0xE0038, 0xE0039, 0xE003A, 0xE003B, 0xE003C, 0xE003D, 0xE003E,
		0xE003F, 0xE0040, 0xE0041, 0xE0042, 0xE0043, 0xE0044, 0xE0045, 0xE0046,
		0xE0047, 0xE0048, 0xE0049, 0xE004A, 0xE004B, 0xE004C, 0xE004D, 0xE004E,
		0xE004F, 0xE0050, 0xE0051, 0xE0052, 0xE0053, 0xE0054, 0xE0055, 0xE0056,
		0xE0057, 0xE0058, 0xE0059, 0xE005A, 0xE005B, 0xE005C, 0xE005D, 0xE005E,
		0xE005F, 0xE0060, 0xE0061, 0xE0062, 0xE0063, 0xE0064, 0xE0065, 0xE0066,
		0xE0067, 0xE0068, 0xE0069, 0xE006A, 0xE006B, 0xE006C, 0xE006D, 0xE006E,
		0xE006F, 0xE0070, 0xE0071, 0xE0072, 0xE0073, 0xE0074, 0xE0075, 0xE0076,
		0xE0077, 0xE0078, 0xE0079, 0xE007A, 0xE007B, 0xE007C, 0xE007D, 0xE007E,
		0xE007F,
	};

	const CodePoint	cp = DecodeUTF16SurrogatePairToCodePoint(pwsz, cch);
	return binary_search(arrCc, arrCc + sizeof(arrCc) / sizeof(CodePoint), cp)
		|| binary_search(arrCf, arrCf + sizeof(arrCf) / sizeof(CodePoint), cp);
}

/**
 *	Measures the run of white space characters at the beginning of a string.
 *	@param pwsz			text to examine
 *	@param cch			number of characters in <var>pwsz</var>
 *	@param bIncludeTab	true to treat the tab character as white space as well
 *	@return				length of the leading white space run, or 0 if the text
 *						does not begin with white space
 */
length_t CLexer::IsWhiteSpace(const char_t* pwsz, length_t cch, bool bIncludeTab) const {
	AssertValid();
	assert(pwsz != 0);

#if ASCENSION_UNICODE_VERSION != 0x0400
#error This code is based on old version of Unicode.
#endif
	// Code points whose general category is Zs (Separator, Space).
	// Generated by unicat.pl (Unicode 4.0).
	// NOTE: Unicode also defines a White_Space property; this array appears to be
	// a subset of it (control characters, for example, are not listed here).
	static const char_t	arrZs[] = {
		0x0020, 0x00A0, 0x1680, 0x180E, 0x2000, 0x2001, 0x2002, 0x2003,
		0x2004, 0x2005, 0x2006, 0x2007, 0x2008, 0x2009, 0x200A, 0x200B,
		0x202F, 0x205F, 0x3000,
	};
	const char_t* const	pZsEnd = arrZs + sizeof(arrZs) / sizeof(char_t);

	for(length_t i = 0; i < cch; ++i) {
		const char_t	ch = pwsz[i];
		if(bIncludeTab && ch == L'\t')
			continue;

		// Without Unicode white spaces only U+0020 counts; with them, any Zs character.
		const bool	bSpace = m_bEnableUnicodeWhiteSpaces ?
			binary_search(arrZs, pZsEnd, ch) : (ch == L' ');
		if(!bSpace)
			return i;
	}
	return cch;
}

/**
 *	Splits a string into tokens.
 *	Token kinds are tried in a fixed order (tab, white space, number, quotations,
 *	annotations, operator, numerals, control characters, identifier/keyword); each
 *	kind is tried only when it is enabled. Text that no kind claims becomes a
 *	TT_UNSPECIFIED token.
 *	@param str		text to parse
 *	@param nCookie	[in, out] on input, the cookie of the multi-line annotation that is
 *					already open before <var>str</var>, or NullCookie if none. On output,
 *					the cookie of the multi-line annotation still open at the end of
 *					<var>str</var>, or NullCookie if none
 *	@return			the list of tokens
 */
TokenList CLexer::Parse(const string_t& str, TokenCookie& nCookie) const {
	AssertValid();

	TokenList		listTokens;
	CToken			oToken;
	const char_t*	pwsz = str.c_str();
	const length_t	cch = str.length();
	bool			bMCommentContinued;

	oToken.m_i = 0;
	if(nCookie != NullCookie) {	// a multi-line annotation continues from the preceding text
		oToken.SetCookie(nCookie);
		const length_t	cchToken = IsMultilineAnnotation(pwsz, cch,
			AR_ONLYSTARTOFLINE | AR_ONLYHEADOFLINE, nCookie, bMCommentContinued);
		oToken.SetType(TT_ANNOTATION);
		listTokens.push_back(oToken);
		oToken.m_i = cchToken;
		if(!bMCommentContinued)
			nCookie = NullCookie;
	}
	oToken.SetCookie(NullCookie);

	length_t				i = oToken.m_i;
	length_t				cchToken;
	TokenCookie				nTokenCookie = NullCookie;
	AnnotationRestriction	ar;
	bool					bAppearedUnspace = false;	// true once a non-blank token was produced
	while(i < cch) {
		// Work out which positional restrictions the current position satisfies.
		ar = AR_NONE;
		if(i == 0)				ar |= AR_ONLYSTARTOFLINE;
		if(!bAppearedUnspace)	ar |= AR_ONLYHEADOFLINE;

		// A supplementary character occupies two code units; single-character
		// tokens must not cut such a pair in half.
		const bool	bSurrogatePair = IsUTF16HighSurrogate(pwsz[i])
			&& i + 1 < cch
			&& IsUTF16LowSurrogate(pwsz[i + 1]);

		if(m_arrEnabledTokenTypes[TT_TAB]	// tab
				&& pwsz[i] == L'\t') {
			cchToken = 1;
			oToken.SetType(TT_TAB);
		} else if(m_arrEnabledTokenTypes[TT_WHITESPACE]	// white space
				&& 0 != (cchToken = IsWhiteSpace(pwsz + i, cch - i, false)))
			oToken.SetType(TT_WHITESPACE);
		else if(m_arrEnabledTokenTypes[TT_NUMBER]	// number
				&& 0 != (cchToken = IsNumber(pwsz + i, cch - i)))
			oToken.SetType(TT_NUMBER);
		else if(m_arrEnabledTokenTypes[TT_SINGLEQUOTATION]	// single quotation
				&& pwsz[i] == L'\''
				&& 0 != (cchToken = IsQuotation(pwsz + i, cch - i, L'\'')))
			oToken.SetType(TT_SINGLEQUOTATION);
		else if(m_arrEnabledTokenTypes[TT_DOUBLEQUOTATION]	// double quotation
				&& pwsz[i] == L'\"'
				&& 0 != (cchToken = IsQuotation(pwsz + i, cch - i, L'\"')))
			oToken.SetType(TT_DOUBLEQUOTATION);
		else if(m_arrEnabledTokenTypes[TT_OTHERQUOTATION]	// other quotation
				&& pwsz[i] != L'\'' && pwsz[i] != L'\"'
				&& 0 != (cchToken = IsQuotation(pwsz + i, cch - i, pwsz[i])))
			oToken.SetType(TT_OTHERQUOTATION);
		else if(m_arrEnabledTokenTypes[TT_ANNOTATION]	// single-line annotation
				&& IsSinglelineAnnotation(pwsz + i, cch - i, ar, nTokenCookie)) {
			// NOTE(review): the length returned by IsSinglelineAnnotation() is ignored and
			// the token always runs to the end of the text; ParseMultilineAnnotation()
			// makes the same assumption. Confirm this is intended for annotations
			// ended by a delimiter.
			cchToken = cch - i;
			oToken.SetCookie(nTokenCookie);
			oToken.SetType(TT_ANNOTATION);
		} else if(m_arrEnabledTokenTypes[TT_ANNOTATION]	// multi-line annotation
				&& 0 != (cchToken = IsMultilineAnnotation(pwsz + i,
				cch - i, ar, nTokenCookie, bMCommentContinued))) {
			oToken.SetCookie(nTokenCookie);
			oToken.SetType(TT_ANNOTATION);
			if(bMCommentContinued)	// continues past the end of this text
				nCookie = nTokenCookie;
		} else if(m_arrEnabledTokenTypes[TT_OPERATOR]	// operator
				&& 0 != (cchToken = IsOperator(pwsz + i, cch - i)))
			oToken.SetType(TT_OPERATOR);
		else if(m_arrEnabledTokenTypes[TT_NUMERAL]	// numerals
				&& 0 != (cchToken = IsNumerals(pwsz + i, cch - i)))
			oToken.SetType(TT_NUMERAL);
		else if(m_arrEnabledTokenTypes[TT_ASCII_CONTROL]	// ASCII control character
				&& IsAsciiControl(pwsz + i, cch - i)) {
			cchToken = 1;
			oToken.SetType(TT_ASCII_CONTROL);
		} else if(m_arrEnabledTokenTypes[TT_UNICODE_CONTROL]	// Unicode control character
				&& IsUnicodeControl(pwsz + i, cch - i)) {
			// IsUnicodeControl() decodes surrogate pairs, so the matched character
			// may be two code units long.
			cchToken = bSurrogatePair ? 2 : 1;
			oToken.SetType(TT_UNICODE_CONTROL);
		} else if(m_arrEnabledTokenTypes[TT_IDENTIFIER]	// identifier or keyword
				&& 0 != (cchToken = IsIdentifier(pwsz + i, cch - i))) {
			if(IsKeyword(string_t(pwsz + i, cchToken), nTokenCookie)) {
				oToken.SetCookie(nTokenCookie);
				oToken.SetType(TT_KEYWORD);
			} else
				oToken.SetType(TT_IDENTIFIER);
		} else {
			// Unclassified text: take one code point, then extend the token over the
			// following code points until one that starts a new cluster is found.
			cchToken = bSurrogatePair ? 2 : 1;
			CodePoint	cp;
			while(i + cchToken < cch) {
				// Examine the NEXT code point (at i + cchToken), not the first one again.
				cp = DecodeUTF16SurrogatePairToCodePoint(
					pwsz + i + cchToken, cch - i - cchToken);
				if(CBoundarySearcher::IsFirstCharacterOfCluster(cp))
					break;
				cchToken += (cp > 0xFFFF) ? 2 : 1;
			}
			oToken.SetType(TT_UNSPECIFIED);
		}
		oToken.m_i = i;
		listTokens.push_back(oToken);
		i += cchToken;
		nTokenCookie = NullCookie;
		oToken.SetCookie(NullCookie);

		if(oToken.GetType() != TT_WHITESPACE && oToken.GetType() != TT_TAB)
			bAppearedUnspace = true;
	}

	return listTokens;
}

/**
 *	Determines which multi-line annotation, if any, is still open at the end of a string,
 *	without building a token list.
 *	@param str		text to examine
 *	@param nCookie	cookie of the multi-line annotation that is already open before
 *					<var>str</var>, or NullCookie if none
 *	@return			cookie of the multi-line annotation that is not closed within
 *					<var>str</var>, or NullCookie if none. An empty string returns
 *					<var>nCookie</var> unchanged
 */
TokenCookie CLexer::ParseMultilineAnnotation(const string_t& str, TokenCookie nCookie) const {
	AssertValid();

	if(str.empty())
		return nCookie;

	const char_t*	pwsz = str.c_str();
	const length_t	cch = str.length();
	length_t		i = 0;
	length_t		cchToken;
	TokenCookie		nDummy = NullCookie;
	bool			bMCommentContinued;

	// A multi-line annotation continues from the preceding text.
	if(nCookie != NullCookie) {
		i = IsMultilineAnnotation(pwsz, cch,
			AR_ONLYSTARTOFLINE | AR_ONLYHEADOFLINE, nCookie, bMCommentContinued);
		if(bMCommentContinued)
			return nCookie;
		nCookie = NullCookie;
	}

	// Build the restriction flags the same way Parse() does. Passing a bool such as
	// (i == 0) never offered AR_ONLYHEADOFLINE, so annotations restricted to the head
	// of a line were recognised by Parse() but missed here.
	AnnotationRestriction	ar;
	bool					bAppearedUnspace = false;	// true once non-blank text was skipped
	while(i < cch) {
		ar = AR_NONE;
		if(i == 0)				ar |= AR_ONLYSTARTOFLINE;
		if(!bAppearedUnspace)	ar |= AR_ONLYHEADOFLINE;

		if(m_arrEnabledTokenTypes[TT_ANNOTATION]
				&& 0 != (cchToken = IsSinglelineAnnotation(pwsz + i, cch - i, ar, nDummy))) {
			// A single-line annotation is treated as covering the rest of the text.
			return NullCookie;
		} else if(m_arrEnabledTokenTypes[TT_ANNOTATION]
				&& 0 != (cchToken = IsMultilineAnnotation(pwsz + i,
					cch - i, ar, nCookie, bMCommentContinued))) {
			if(bMCommentContinued)
				return nCookie;
			nCookie = NullCookie;
			i += cchToken;
			bAppearedUnspace = true;
		} else if(m_arrEnabledTokenTypes[TT_SINGLEQUOTATION]
				&& 0 != (cchToken = IsQuotation(pwsz + i, cch - i, L'\''))) {
			i += cchToken;
			bAppearedUnspace = true;
		} else if(m_arrEnabledTokenTypes[TT_DOUBLEQUOTATION]
				&& 0 != (cchToken = IsQuotation(pwsz + i, cch - i, L'\"'))) {
			i += cchToken;
			bAppearedUnspace = true;
		} else if(m_arrEnabledTokenTypes[TT_OTHERQUOTATION]
				&& 0 != (cchToken = IsQuotation(pwsz + i, cch - i, pwsz[i]))) {
			i += cchToken;
			bAppearedUnspace = true;
		} else {
			// Same blank test as Parse(): a tab or white space character counts as
			// blank only when the corresponding token type is enabled.
			const bool	bBlank =
				(m_arrEnabledTokenTypes[TT_TAB] && pwsz[i] == L'\t')
				|| (m_arrEnabledTokenTypes[TT_WHITESPACE]
					&& IsWhiteSpace(pwsz + i, 1, false) != 0);
			if(!bBlank)
				bAppearedUnspace = true;
			++i;
		}
	}
	return NullCookie;
}

///	Removes every registered keyword, annotation and operator, then calls NotifyChange().
///	NOTE(review): unlike RemoveIdentifiedToken(), the event listener is not told about
///	the removed cookies here -- confirm that is intended.
void CLexer::RemoveAll() {
	AssertValid();

	// Operators
	m_mapOperators.clear();
	// Annotations (ended by delimiter, ended by line break, multi-line)
	m_mapSAnnotationDs.clear();
	m_mapSAnnotationBs.clear();
	m_mapMAnnotations.clear();
	// Keywords
	m_mapKeywords.clear();

	NotifyChange();
}

/**
 *	Removes a token definition registered with one of the AddXXXX methods.
 *	The keyword groups are searched first, then the multi-line annotations, the
 *	annotations ended by a line break and the annotations ended by a delimiter.
 *	The listener (if any) is notified and NotifyChange() is called on success.
 *	@param nCookie	cookie of the definition to remove
 *	@exception invalid_argument	thrown when no definition is registered under <var>nCookie</var>
 */
void CLexer::RemoveIdentifiedToken(TokenCookie nCookie) throw(invalid_argument) {
	AssertValid();

	// TT_COUNT doubles as the "nothing removed" marker.
	TokenType	type = TT_COUNT;

	// erase(key) returns the number of removed entries; the short-circuit keeps the
	// search order and stops at the first container that held the cookie.
	if(m_mapKeywords.erase(nCookie) != 0)
		type = TT_KEYWORD;
	else if(m_mapMAnnotations.erase(nCookie) != 0
			|| m_mapSAnnotationBs.erase(nCookie) != 0
			|| m_mapSAnnotationDs.erase(nCookie) != 0)
		type = TT_ANNOTATION;

	if(type == TT_COUNT)
		throw invalid_argument("Specified cookie value is invalid.");
	if(m_pEventListener != 0)
		m_pEventListener->OnLexerRemovedIdentifiedToken(type, nCookie);
	NotifyChange();
}

///	Restores the case-sensitivity, Unicode and number-format settings to their
///	constructor defaults, re-enables every token type and removes all registered
///	keywords, annotations and operators (see RemoveAll()).
void CLexer::Reset() {
	AssertValid();

	// Re-enable every token type.
	size_t	iType = TT_COUNT;
	while(iType != 0)
		m_arrEnabledTokenTypes[--iType] = true;

	// Same defaults as the constructor.
	m_numberFormat = NF_NUMERAL_FOLLOWED_BY_ALPHANUMERAL;
	m_bEnableUnicodeWhiteSpaces = true;
	m_bEnableUnicodeAlphabets = true;
	m_bIgnoreCase = false;

	// Must come last: RemoveAll() issues the change notification.
	RemoveAll();
}

/**
 *	̃At@xbgȊOɁAP\ƂĂ݂ȂR[h|Cgݒ肷
 *	@param pwszAlphabets	ׂBLȃTQ[gyA͔ BMP ɕϊ
 *	@param cch				<var>pwszAlphabets</var> ̕
 */
void CLexer::SetAdditionalAlphabets(const char_t* pwszAlphabets, length_t cch) {
	AssertValid();

	m_listAdditionalAlphabets.clear();
	for(length_t i = 0; i < cch; ++i) {
		const CodePoint	cp = DecodeUTF16SurrogatePairToCodePoint(pwszAlphabets + i, cch - i);

		m_listAdditionalAlphabets.push_back(cp);
		if(cp > 0xFFFF)
			++i;
	}
	m_listAdditionalAlphabets.sort();
	NotifyChange();
}

/**
 *	Replaces the set of additional code points that are treated as identifier start
 *	characters (see IsIdentifierStartCodePoint()).
 *	@param setAlphabets	the code points
 */
void CLexer::SetAdditionalAlphabets(const set<CodePoint>& setAlphabets) {
	AssertValid();

	// assign() replaces the old contents and grows the list as needed. Copying to
	// begin() of a list that was just cleared would write past its end.
	// A set iterates in ascending order, so the list stays sorted for binary_search.
	m_listAdditionalAlphabets.assign(setAlphabets.begin(), setAlphabets.end());
	NotifyChange();
}

/**
 *	ʂƂĎgp镶ݒ肷
 *	@param pwszBrackets				Jʂׂ
 *	@throw std::invalid_argument	JʂƂĎgpłȂɊ܂܂ĂƂX[
 */
void CLexer::SetBrackets(const char_t* pwszBrackets) {
	AssertValid();
	assert(pwszBrackets != 0);

	ostringstream_t	ss;

	for(size_t i = 0; pwszBrackets[i] != 0; ++i) {
		for(size_t j = 0; ; ++j) {
			if(garrBracketPairs[j].first == 0 || garrBracketPairs[j].second == pwszBrackets[i])
				throw invalid_argument("Specified character can not be used as an opener.");
			else if(garrBracketPairs[j].first == pwszBrackets[i])
				break;
		}
		ss << garrBracketPairs[i].first << garrBracketPairs[i].second;
	}
	const string_t	str = ss.str();
	delete[] m_pwszBrackets;
	m_pwszBrackets = new char_t[str.length() + 1];
	wcscpy(m_pwszBrackets, str.c_str());
}

/**
 *	Zqݒ肷
 *	@param setOperators	Zq̏W
 */
void CLexer::SetOperators(const set<string_t>& setOperators) {
	AssertValid();

	m_mapOperators.clear();
	for(set<string_t>::const_iterator it =
			setOperators.begin(); it != setOperators.end(); ++it) {
		if(it->empty())
			continue;
		m_mapOperators[it->at(0)].push_back(*it);
	}
	for(map<char_t, OperatorList>::iterator it = m_mapOperators.begin(); it != m_mapOperators.end(); ++it)
		it->second.sort(gCompareLength);
	NotifyChange();
}

/* [EOF] */