// Lexer.cpp
// (c) 2004-2006 exeal

#include "StdAfx.h"
#include "Lexer.h"
#include "TextSearcher.h"	// CBoundarySearcher::IsGraphemeBase

using namespace Ascension;
using namespace std;


namespace {
#if ASCENSION_UNICODE_VERSION != 0x0410
#error This array is based on old version of Unicode.
#endif
	// Table of (opening bracket, closing bracket) pairs, written for Unicode 4.1
	// (see the version guard above). '<' and '>' are included for XML.
	// The table is terminated by a (0, 0) entry.
	const pair<char_t, char_t>	bracketPairs[] = {
		make_pair(0x0028, 0x0029),	// Parenthesis
		make_pair(0x003C, 0x003E),	// [for XML] Less-Than/Greater-Than Sign
		make_pair(0x005B, 0x005D),	// Square Bracket
		make_pair(0x007B, 0x007D),	// Curly Bracket
		make_pair(0x0F3A, 0x0F3B),	// Tibetan Mark Gug Rtags Gyon and Gyas
		make_pair(0x0F3C, 0x0F3D),	// Tibetan Mark Ang Khang Gyon and Gyas
		make_pair(0x169B, 0x169C),	// Ogham Feather Mark and reversed one
//		make_pair(0x201A, 0x????),	// Single Low-9 Quotation Mark
//		make_pair(0x201E, 0x????),	// Double Low-9 Quotation Mark
		make_pair(0x2045, 0x2046),	// Square Bracket With Quill
		make_pair(0x207D, 0x207E),	// Superscript Parenthesis
		make_pair(0x208D, 0x208E),	// Subscript Parenthesis
		make_pair(0x2329, 0x232A),	// Pointing Angle Bracket
		make_pair(0x23B4, 0x23B5),	// Square Bracket (top/bottom)
		make_pair(0x2768, 0x2769),	// Medium Parenthesis Ornament
		make_pair(0x276A, 0x276B),	// Medium Flattened Parenthesis Ornament
		make_pair(0x276C, 0x276D),	// Medium Pointing Angle Bracket Ornament
		make_pair(0x276E, 0x276F),	// Heavy Pointing Angle Quotation Mark Ornament
		make_pair(0x2770, 0x2771),	// Heavy Pointing Angle Bracket Ornament
		make_pair(0x2772, 0x2773),	// Light Tortoise Shell Bracket Ornament
		make_pair(0x2774, 0x2775),	// Medium Curly Bracket Ornament
		make_pair(0x27C5, 0x27C6),	// S-Shaped Bag Delimiter
		make_pair(0x27E6, 0x27E7),	// Mathematical White Square Bracket
		make_pair(0x27E8, 0x27E9),	// Mathematical Angle Bracket
		make_pair(0x27EA, 0x27EB),	// Mathematical Double Angle Bracket
		make_pair(0x2983, 0x2984),	// White Curly Bracket
		make_pair(0x2985, 0x2986),	// White Parenthesis
		make_pair(0x2987, 0x2988),	// Z Notation Image Bracket
		make_pair(0x2989, 0x298A),	// Z Notation Binding Bracket
		make_pair(0x298B, 0x298C),	// Square Bracket With Underbar
		make_pair(0x298D, 0x298E),	// Left Square Bracket With Tick In Top Corner and Right ... Bottom
		make_pair(0x298F, 0x2990),	// Left Square Bracket With Tick In Bottom Corner and Right ... Top
		make_pair(0x2991, 0x2992),	// Angle Bracket With Dot
		make_pair(0x2993, 0x2994),	// Arc Less-Than Bracket
		make_pair(0x2995, 0x2996),	// Double Arc Greater-Than Bracket
		make_pair(0x2997, 0x2998),	// Black Tortoise Shell Bracket
		make_pair(0x29D8, 0x29D9),	// Wiggly Fence
		make_pair(0x29DA, 0x29DB),	// Double Wiggly Fence
		make_pair(0x29FC, 0x29FD),	// Pointing Curved Angle Bracket
		make_pair(0x3008, 0x3009),	// Angle Bracket
		make_pair(0x300A, 0x300B),	// Double Angle Bracket
		make_pair(0x300C, 0x300D),	// Corner Bracket
		make_pair(0x300E, 0x300F),	// White Corner Bracket
		make_pair(0x3010, 0x3011),	// Black Lenticular Bracket
		make_pair(0x3014, 0x3015),	// Tortoise Shell Bracket
		make_pair(0x3016, 0x3017),	// White Lenticular Bracket
		make_pair(0x3018, 0x3019),	// White Tortoise Shell Bracket
		make_pair(0x301A, 0x301B),	// White Square Bracket
		make_pair(0x301D, 0x301F),	// Double Prime Quotation Mark and reversed one
//		make_pair(0x????, 0x301E),	// Double Prime Quotation Mark (deprecated: mistaken analogue)
		make_pair(0xFD3E, 0xFD3F),	// Ornate Parenthesis
		make_pair(0xFE17, 0xFE18),	// Presentation Form For Vertical White Lenticular Bracket (closer is U+FE18, not U+EF18)
		make_pair(0xFE35, 0xFE36),	// Presentation Form For Vertical Parenthesis
		make_pair(0xFE37, 0xFE38),	// - Curly Bracket
		make_pair(0xFE39, 0xFE3A),	// - Tortoise Shell Bracket
		make_pair(0xFE3B, 0xFE3C),	// - Black Lenticular Bracket
		make_pair(0xFE3D, 0xFE3E),	// - Double Angle Bracket
		make_pair(0xFE3F, 0xFE40),	// - Angle Bracket
		make_pair(0xFE41, 0xFE42),	// - Corner Bracket
		make_pair(0xFE43, 0xFE44),	// - White Corner Bracket
		make_pair(0xFE45, 0xFE46),	// Sesame Dot and White one
		make_pair(0xFE47, 0xFE48),	// - Square Bracket
		make_pair(0xFE59, 0xFE5A),	// Small Parenthesis
		make_pair(0xFE5B, 0xFE5C),	// Small Curly Bracket
		make_pair(0xFE5D, 0xFE5E),	// Small Tortoise Shell Bracket
		make_pair(0xFF08, 0xFF09),	// Fullwidth Parenthesis
		make_pair(0xFF3B, 0xFF3D),	// Fullwidth Square Bracket
		make_pair(0xFF5B, 0xFF5D),	// Fullwidth Curly Bracket
		make_pair(0xFF5F, 0xFF60),	// Fullwidth White Parenthesis
		make_pair(0xFF62, 0xFF63),	// Halfwidth Corner Bracket
		make_pair(0x0000, 0x0000)	// terminator
	};

	// Returns true if the first `length` characters of `p1` and `p2` are equal.
	// Both pointers must be non-null and `length` must not be zero (asserted).
	// Debug builds compare with wcsncmp (which stops at a NUL character);
	// other builds compare the raw memory with memcmp.
	inline bool isEqualString(const char_t* p1, const char_t* p2, size_t length) {
		assert(p1 != 0 && p2 != 0 && length != 0);
#ifdef _DEBUG
		if(p1[0] != p2[0])
			return false;
		if(length == 1)
			return true;
		return wcsncmp(p1 + 1, p2 + 1, length - 1) == 0;
#else
		return memcmp(p1, p2, sizeof(char_t) * length) == 0;
#endif /* _DEBUG */
	}
}


// URIDetector class implementation
/////////////////////////////////////////////////////////////////////////////

/**
 *	񂪃[AhX𒲂ׂ
 *	@param first, last	ׂ镶
 *	@param asIRI		UCS F߂邩 ()
 *	@return				[AhX̏I[
 */
const char_t* URIDetector::eatMailAddress(const char_t* first, const char_t* last, bool) {
	// ̃\bh "/[\w\d][\w\d\.\-_]*@[\w\d\-_]+(\.[\w\d\-_]+)+/" ̂悤ȃp^[}b`s
#define IS_ALNUM(ch)					\
	(((ch) >= L'A' && (ch) <= L'Z')		\
	|| ((ch) >= L'a' && (ch) <= L'z')	\
	|| ((ch) >= L'0' && (ch) <= L'9'))
#define IS_ALNUMBAR(ch)	\
	(IS_ALNUM(ch) || ch == L'-' || ch == L'_')

	if(last - first < 5)
		return first;

	// 1
	if(!IS_ALNUM(*first))
		return first;

	// 2ڂ '@'
	const char_t* const originalFirst = first++;
	for(; first < last - 3; ++first) {
		if(!IS_ALNUMBAR(*first) && *first != L'.')
			break;
	}
	if(*first != L'@' || last - first == 3)
		return originalFirst;

	// '@' ̌
	const char_t* const	atMark = first;
	bool				dotAppeared = false;
	for(first = atMark + 1; first < last; ++first) {
		if(IS_ALNUMBAR(*first))
			continue;
		else if(*first == L'.') {
			if(first[-1] == L'.')
				return originalFirst;
			dotAppeared = true;
		} else
			break;
	}
	return (dotAppeared && (first - atMark > 2)) ? first : originalFirst;
}

/**
 *	@brief Checks whether the given text starts with a URL.
 *
 *	The following prefixes are regarded as the start of a URL:
 *	<ul>
 *		<li>file://</li><li>ftp://</li><li>gopher://</li><li>http://</li><li>https://</li>
 *		<li>mailto://</li><li>news://</li><li>nntp://</li><li>telnet://</li><li>wais://</li>
 *	</ul>
 *	At least one character must follow the prefix. The URL then extends over
 *	every following character marked in the table below and ends at the first
 *	other character (any non-ASCII character included) or at @a last.
 *
 *	@param first, last	the range of the text to examine
 *	@param asIRI		unused
 *	@return				the end of the URL, or @a first if the text does not start with a URL
 */
const char_t* URIDetector::eatURL(const char_t* first, const char_t* last, bool) {
	// Characters accepted inside a URL, indexed by character code.
	// The table covers 0x00..0x7F only, so callers must reject larger codes first.
	static const bool urlChars[] = {
		false,	false,	false,	false,	false,	false,	false,	false,	// 0x00
		false,	false,	false,	false,	false,	false,	false,	false,
		false,	false,	false,	false,	false,	false,	false,	false,	// 0x10
		false,	false,	false,	false,	false,	false,	false,	false,
		false,	true,	false,	true,	true,	true,	true,	false,	// 0x20
		false,	false,	false,	true,	true,	true,	true,	true,
		true,	true,	true,	true,	true,	true,	true,	true,	// 0x30
		true,	true,	true,	true,	false,	true,	false,	true,
		true,	true,	true,	true,	true,	true,	true,	true,	// 0x40
		true,	true,	true,	true,	true,	true,	true,	true,
		true,	true,	true,	true,	true,	true,	true,	true,	// 0x50
		true,	true,	true,	false,	true,	false,	false,	true,
		false,	true,	true,	true,	true,	true,	true,	true,	// 0x60
		true,	true,	true,	true,	true,	true,	true,	true,
		true,	true,	true,	true,	true,	true,	true,	true,	// 0x70
		true,	true,	true,	false,	false,	false,	true,	false
	};
	// Recognized scheme prefixes and their lengths in characters.
	static const struct {
		const char_t*	text;
		length_t		length;
	} prefixes[] = {
		{L"file://", 7},	{L"ftp://", 6},		{L"gopher://", 9},	{L"http://", 7},
		{L"https://", 8},	{L"mailto://", 9},	{L"news://", 7},	{L"nntp://", 7},
		{L"telnet://", 9},	{L"wais://", 7}
	};

	// Check the length before touching *first, and reject non-ASCII characters
	// before indexing the 128-entry table (a 0x00FF mask would read past its end).
	if(last - first < 6 || first[0] > 0x007F || !urlChars[first[0] & 0x007F])
		return first;

	const length_t available = static_cast<length_t>(last - first);
	for(size_t i = 0; i < sizeof(prefixes) / sizeof(prefixes[0]); ++i) {
		if(prefixes[i].length >= available
				|| !isEqualString(first, prefixes[i].text, prefixes[i].length))
			continue;
		// the prefix matched: the URL runs up to the first character that is not a URL character
		for(length_t urlLength = prefixes[i].length; urlLength < available; ++urlLength) {
			if(first[urlLength] > 0x007F || !urlChars[first[urlLength] & 0x007F])
				return first + urlLength;
		}
		return last;
	}
	return first;
}


// Lexer class implementation
/////////////////////////////////////////////////////////////////////////////

/// The opening brackets of the ASCII range (the constructor passes this list to @c setBrackets).
const char_t Lexer::ASCII_OPENERS[] = L"([{";
/// The opening brackets of the bracket-pair table at the top of this file, except '<'
/// (which that table lists for XML only).
const char_t Lexer::UNICODE_OPENERS[] = L"([{\x0F3A\x0F3C\x169B\x2045\x207D\x208D\x2329\x23B4"
											L"\x2768\x276A\x276C\x276E\x2770\x2772\x2774\x27C5"
											L"\x27E6\x27E8\x27EA\x2983\x2985\x2987\x2989\x298B"
											L"\x298D\x298F\x2991\x2993\x2995\x2997\x29D8\x29DA"
											L"\x29FC\x3008\x300A\x300C\x300E\x3010\x3014\x3016"
											L"\x3018\x301A\x301D\xFD3E\xFE17\xFE35\xFE37\xFE39"
											L"\xFE3B\xFE3D\xFE3F\xFE41\xFE43\xFE45\xFE47\xFE59"
											L"\xFE5B\xFE5D\xFF08\xFF3B\xFF5B\xFF5F\xFF62";

/// The cookie value that stands for "no cookie".
const Token::Cookie Token::NULL_COOKIE = 0;
/// The cookie value the next add-method of @c Lexer hands out; the first one is Token::NULL_COOKIE + 1.
Token::Cookie Lexer::nextCookie_ = Token::NULL_COOKIE + 1;

/**
 *	Constructor. Enables every token type and installs the ASCII brackets
 *	(Lexer::ASCII_OPENERS) as the bracket set.
 *	@param eventListener the event listener (may be null; it is null-checked before every use in this file)
 */
Lexer::Lexer(Lexer::IEventListener* eventListener) :
		freezed_(false), caseSensitive_(true), escapeByBackSolidus_(true),
		enableUnicodeAlphabets_(true), enableUnicodeWhiteSpaces_(true),
		numberFormat_(NF_NUMERAL_FOLLOWED_BY_ALPHANUMERAL),
		brackets_(0), eventListener_(eventListener) {
	fill(enabledTokenTypes_, endof(enabledTokenTypes_), true);
	setBrackets(Lexer::ASCII_OPENERS);
}

/// Destructor. Releases the bracket list and every registered keyword table.
Lexer::~Lexer() {
	delete[] brackets_;
	clearKeywords();
}

/**
 *	Registers a set of keywords.
 *	@param keywords	the keywords to add
 *	@return			the cookie that identifies the added keyword set
 */
Token::Cookie Lexer::addKeywords(const set<string_t>& keywords) {
	assertValid();

	// Reserve the cookie before anything can throw or re-enter, so that the
	// same value is never handed out twice.
	const Token::Cookie cookie = Lexer::nextCookie_++;

	HashTable* const table = new HashTable(keywords, caseSensitive_);
	try {
		keywords_.insert(make_pair(cookie, table));
	} catch(...) {
		delete table;	// the map did not take ownership
		throw;
	}
	if(eventListener_ != 0)
		eventListener_->onLexerAddedIdentifiedToken(Token::KEYWORD, cookie);
	notifyChange();
	return cookie;
}

/**
 *	Registers a multi-line annotation.
 *	@param startDelimiter	the string that starts the annotation
 *	@param endDelimiter		the string that ends the annotation
 *	@param constraint		the constraint on the position of the start delimiter
 *	@return					the cookie that identifies the added annotation
 */
Token::Cookie Lexer::addMultilineAnnotation(const string_t& startDelimiter,
		const string_t& endDelimiter, AnnotationConstraint constraint /* = AC_NONE */) {
	assertValid();

	// Reserve the cookie before anything can throw or re-enter, so that the
	// same value is never handed out twice.
	const Token::Cookie cookie = Lexer::nextCookie_++;
	const MultilineAnnotation annotation = {startDelimiter, endDelimiter, constraint};

	multilineAnnotations_.insert(make_pair(cookie, annotation));
	if(eventListener_ != 0)
		eventListener_->onLexerAddedIdentifiedToken(Token::ANNOTATION, cookie);
	notifyChange();

	return cookie;
}

/**
 *	Registers a single-line annotation that extends to the end of the line.
 *	@param startDelimiter	the string that starts the annotation
 *	@param constraint		the constraint on the position of the start delimiter
 *	@return					the cookie that identifies the added annotation
 */
Token::Cookie Lexer::addSinglelineAnnotation(const string_t& startDelimiter, AnnotationConstraint constraint /* = AC_NONE */) {
	assertValid();

	// Reserve the cookie before anything can throw or re-enter, so that the
	// same value is never handed out twice.
	const Token::Cookie cookie = Lexer::nextCookie_++;
	const SinglelineAnnotationEndedByBreak annotation = {startDelimiter, constraint};

	singlelineAnnotationBs_.insert(make_pair(cookie, annotation));
	if(eventListener_ != 0)
		eventListener_->onLexerAddedIdentifiedToken(Token::ANNOTATION, cookie);
	notifyChange();

	return cookie;
}

/**
 *	Registers a single-line annotation that is closed by an end delimiter
 *	(or by the end of the line when the delimiter does not appear).
 *	@param startDelimiter	the string that starts the annotation
 *	@param endDelimiter		the string that ends the annotation
 *	@param constraint		the constraint on the position of the start delimiter
 *	@return					the cookie that identifies the added annotation
 */
Token::Cookie Lexer::addSinglelineAnnotation(const string_t& startDelimiter,
		const string_t& endDelimiter, AnnotationConstraint constraint /* = AC_NONE */) {
	assertValid();

	// Reserve the cookie before anything can throw or re-enter, so that the
	// same value is never handed out twice.
	const Token::Cookie cookie = Lexer::nextCookie_++;
	const SinglelineAnnotationEndedByDelimiter annotation = {startDelimiter, endDelimiter, constraint};

	singlelineAnnotationDs_.insert(make_pair(cookie, annotation));
	if(eventListener_ != 0)
		eventListener_->onLexerAddedIdentifiedToken(Token::ANNOTATION, cookie);
	notifyChange();

	return cookie;
}

/// L[[hSč폜
void Lexer::clearKeywords() {
	for(KeywordsMap::iterator it = keywords_.begin(); it != keywords_.end(); ++it)
		delete it->second;
	keywords_.clear();
}

/// Number scanner for @c NF_NUMERAL_FOLLOWED_BY_ALPHANUMERAL: a number starts with a
/// digit and continues over '.' and identifier-continue characters.
/// @a first is dereferenced unconditionally, so the range must not be empty.
/// Returns the end of the number, or @a first if the text does not start with a digit.
template<> inline const char_t* Lexer::doEatNumbers<NF_NUMERAL_FOLLOWED_BY_ALPHANUMERAL>(const char_t* first, const char_t* last) const {
	if(!toBoolean(iswdigit(*first)))
		return first;

	while(first < last) {
		// decode a surrogate pair only when both halves are present
		CodePoint cp;
		if(first < last - 1
				&& UTF16Surrogates::isHighSurrogate(first[0])
				&& UTF16Surrogates::isLowSurrogate(first[1]))
			cp = UTF16Surrogates::decode(first, last - first);
		else
			cp = *first;

		if(*first != L'.' && !isIdentifierContinueCodePoint(cp))
			return first;
		first += (cp > 0xFFFF) ? 2 : 1;	// a supplementary character occupies two code units
	}
	return last;
}

/**
 *	L[[h𒲂ׂ
 *	@param first, last	ׂ镶
 *	@param cookie		[out] L[[hɊ֘AtꂽNbL[l
 *	@return				L[[hłꍇ true
 */
bool Lexer::eatKeyword(const char_t* first, const char_t* last, Token::Cookie& cookie) const {
	assertValid();

	for(KeywordsMap::const_iterator it = keywords_.begin(); it != keywords_.end(); ++it) {
		if(it->second->find(first, last)) {
			cookie = it->first;
			return true;
		}
	}
	return false;
}

/**
 *	Checks whether the given text starts with (or continues) a multi-line annotation.
 *	@param first, last	the range of the text to examine
 *	@param constraint	the constraints the position of @a first satisfies
 *	@param cookie		[in, out] On entry, the cookie of the multi-line annotation that continues
 *						from before @a first, or Token::NULL_COOKIE if there is none. When the
 *						entry value is Token::NULL_COOKIE and a start delimiter matches at
 *						@a first, it is set to the cookie of the matched annotation; otherwise it
 *						is left unchanged.
 *	@param continued	[out] true if the annotation is not closed within the range
 *	@return				the end of the annotation (@a last when it is not closed),
 *						or @a first if no annotation starts here
 *	@throw std::invalid_argument	@a cookie is not the cookie of a registered multi-line annotation
 */
const char_t* Lexer::eatMultilineAnnotation(const char_t* first, const char_t* last,
		AnnotationConstraint constraint, Token::Cookie& cookie, bool& continued) const {
	assertValid();

	continued = false;
	if(cookie == Token::NULL_COOKIE) {	// not inside an annotation yet: look for a start delimiter
		const string_t::size_type available = static_cast<string_t::size_type>(last - first);
		for(MAnnotationMap::const_iterator candidate = multilineAnnotations_.begin();
				candidate != multilineAnnotations_.end(); ++candidate) {
			const string_t& opener = candidate->second.startDelimiter;
			if((candidate->second.constraint & constraint) == candidate->second.constraint
					&& opener.length() <= available
					&& isEqualString(first, opener.data(), opener.length())) {
				cookie = candidate->first;
				first += opener.length();
				break;
			}
		}
		if(cookie == Token::NULL_COOKIE)
			return first;	// nothing matched, so first has not moved
	}
	continued = true;

	// look for the end delimiter of the annotation identified by the cookie
	const MAnnotationMap::const_iterator current = multilineAnnotations_.find(cookie);
	if(current == multilineAnnotations_.end())
		throw invalid_argument("Input cookie value is invalid.");

	const string_t& closer = current->second.endDelimiter;
	const char_t* const closerStart = search(first, last, closer.begin(), closer.end());
	if(closerStart != last) {
		continued = false;
		return closerStart + closer.length();
	}
	return last;	// not closed within this range
}

/**
 *	Checks whether the given text starts with a number, using the current number format.
 *	Only NF_NUMERAL_FOLLOWED_BY_ALPHANUMERAL is implemented; the other formats
 *	(NF_CPLUSPLUS, NF_PERL, NF_RUBY, NF_VBSCRIPT, NF_JAVASCRIPT_15, NF_JAVASCRIPT_20)
 *	are not dispatched yet and trip the assertion.
 *	@param first, last	the range of the text to examine
 *	@return				the end of the number, or @a first if the text does not start with a number
 */
const char_t* Lexer::eatNumbers(const char_t* first, const char_t* last) const {
	assertValid();

	if(numberFormat_ == NF_NUMERAL_FOLLOWED_BY_ALPHANUMERAL)
		return doEatNumbers<NF_NUMERAL_FOLLOWED_BY_ALPHANUMERAL>(first, last);

	assert(false);	// unsupported number format
	return first;
}

/**
 *	Checks whether the given text starts with a registered operator.
 *	The operators that begin with the character at @a first are tried in the
 *	iteration order of their set, and the first one that matches wins.
 *	@param first, last	the range of the text to examine
 *	@return				the end of the operator, or @a first if no operator starts here
 */
const char_t* Lexer::eatOperators(const char_t* first, const char_t* last) const {
	assertValid();

	const OperatorMap::const_iterator bucket = operators_.find(*first);
	if(bucket != operators_.end()) {
		const string_t::size_type available = static_cast<string_t::size_type>(last - first);
		const OperatorSet& candidates = bucket->second;
		for(OperatorSet::const_iterator candidate = candidates.begin(); candidate != candidates.end(); ++candidate) {
			if(candidate->length() <= available
					&& isEqualString(first, candidate->data(), candidate->length()))
				return first + candidate->length();
		}
	}
	return first;
}

/**
 *	Checks whether the given text starts with a single-line annotation.
 *	Annotations that run to the end of the line are tried first, then the
 *	annotations that are closed by an end delimiter.
 *	@param first, last	the range of the text to examine
 *	@param constraint	the constraints the position of @a first satisfies
 *	@param cookie		[out] the cookie of the matched annotation (left untouched when nothing matches)
 *	@return				the end of the annotation, or @a first if no annotation starts here
 */
const char_t* Lexer::eatSinglelineAnnotation(const char_t* first, const char_t* last,
		AnnotationConstraint constraint, Token::Cookie& cookie) const {
	assertValid();

	const string_t::size_type available = static_cast<string_t::size_type>(last - first);

	// annotations ended by the line break: a match always covers the rest of the range
	for(SAnnotationBMap::const_iterator b = singlelineAnnotationBs_.begin();
			b != singlelineAnnotationBs_.end(); ++b) {
		const string_t& opener = b->second.startDelimiter;
		if((b->second.constraint & constraint) == b->second.constraint
				&& opener.length() <= available
				&& isEqualString(first, opener.data(), opener.length())) {
			cookie = b->first;
			return last;
		}
	}

	// annotations ended by a delimiter: a match runs through the end delimiter,
	// or to the end of the range when the delimiter is missing
	for(SAnnotationDMap::const_iterator d = singlelineAnnotationDs_.begin();
			d != singlelineAnnotationDs_.end(); ++d) {
		const string_t& opener = d->second.startDelimiter;
		if((d->second.constraint & constraint) == d->second.constraint
				&& opener.length() <= available
				&& isEqualString(first, opener.data(), opener.length())) {
			cookie = d->first;
			const string_t& closer = d->second.endDelimiter;
			const char_t* const end = search(first + opener.length(), last, closer.begin(), closer.end());
			return (end < last) ? end + closer.length() : last;
		}
	}
	return first;
}

/**
 *	Returns the traits of the specified bracket character.
 *	The bracket list stores each opener immediately followed by its closer,
 *	so an even offset denotes an opener and an odd offset a closer.
 *	@param bracketChar	the character to examine
 *	@param pairChar		[out] the matching bracket (set only when the method returns true)
 *	@param opener		[out] true if @a bracketChar is an opening bracket (set only when the method returns true)
 *	@return				true if @a bracketChar is one of the brackets in use
 */
bool Lexer::getBracketTraits(char_t bracketChar, char_t& pairChar, bool& opener) const {
	// wcschr() also finds the terminating NUL; without this guard a NUL argument
	// would be reported as an opener and found[1] would read past the list.
	if(bracketChar == 0)
		return false;

	const char_t* const found = wcschr(brackets_, bracketChar);
	if(found == 0)
		return false;

	opener = ((found - brackets_) % 2 == 0);
	pairChar = opener ? found[1] : found[-1];
	return true;
}

/**
 *	L[[h̑啶ʂ邩ǂ̐ݒB
 *	ݒύXƓo^ĂL[[h͑Sč폜
 *	@param ignore ʂȂꍇ true
 */
void Lexer::ignoreCase(bool ignore) {
	assertValid();
	if(ignore == caseSensitive_) {
		caseSensitive_ = !ignore;
		keywords_.clear();
		notifyChange();
	}
}

/**
 *	Returns whether the code point can continue an identifier.
 *	With Unicode alphabets enabled the test uses general categories and the
 *	Other_ID_Continue property; otherwise only '_' and the ASCII digits are
 *	accepted in addition to the identifier-start characters.
 *	@param cp	the code point to examine
 *	@return		true if @a cp is an identifier-continue character
 *	@see		Lexer::isIdentifierStartCodePoint
 */
bool Lexer::isIdentifierContinueCodePoint(CodePoint cp) const {
	assertValid();

	// every identifier-start character may also continue an identifier
	if(isIdentifierStartCodePoint(cp))
		return true;

	if(!enableUnicodeAlphabets_)
		return (cp == L'_') || (cp >= L'0' && cp <= L'9');

	const CharProperty::GeneralCategory category = CharProperty::getGeneralCategory(cp);
	if(category == CharProperty::GC_MARK_NONSPACING
			|| category == CharProperty::GC_MARK_SPACING_COMBINING
			|| category == CharProperty::GC_NUMBER_DECIMAL_DIGIT
			|| category == CharProperty::GC_PUNCTUATION_CONNECTOR)
		return true;
	return CharProperty::hasBinaryProperty<CharProperty::BP_OTHER_ID_CONTINUE>(cp);
}

/**
 *	Returns whether the code point can start an identifier.
 *	The additional alphabets always qualify. Beyond those, the test uses the
 *	letter general categories and the Other_ID_Start property when Unicode
 *	alphabets are enabled, and the ASCII letters otherwise.
 *	@param cp	the code point to examine
 *	@return		true if @a cp is an identifier-start character
 *	@see		Lexer::isIdentifierContinueCodePoint
 */
bool Lexer::isIdentifierStartCodePoint(CodePoint cp) const {
	assertValid();

	if(binary_search(additionalAlphabets_.begin(), additionalAlphabets_.end(), cp))
		return true;

	if(!enableUnicodeAlphabets_)
		return (cp >= 'A' && cp <= 'Z') || (cp >= 'a' && cp <= 'z');

	const CharProperty::GeneralCategory category = CharProperty::getGeneralCategory(cp);
	if(category == CharProperty::GC_LETTER_LOWERCASE
			|| category == CharProperty::GC_LETTER_MODIFIER
			|| category == CharProperty::GC_LETTER_OTHER
			|| category == CharProperty::GC_LETTER_TITLECASE
			|| category == CharProperty::GC_LETTER_UPPERCASE)
		return true;
	return CharProperty::hasBinaryProperty<CharProperty::BP_OTHER_ID_START>(cp);
}

/**
 *	Analyzes the text and splits it into tokens.
 *	@param first, last	the range of the text to analyze
 *	@param cookie		[in, out] On entry, the cookie of the multi-line annotation that continues
 *						from before the text, or Token::NULL_COOKIE if there is none. It is reset
 *						to Token::NULL_COOKIE when that annotation ends within the text, and set
 *						to the cookie of a multi-line annotation that starts in the text and is
 *						not closed in it.
 *	@param tokens		[out] the list the tokens are appended to
 */
void Lexer::parse(const char_t* first, const char_t* last, Token::Cookie& cookie, list<Token>& tokens) const {
	assertValid();

	const char_t* const originalFirst = first;
	const char_t* tokenEnd;
	Token token;
	bool multilineCommentContinued;

	token.index_ = 0;
	if(cookie != Token::NULL_COOKIE) {	// a multi-line annotation continues from before the text
		token.setCookie(cookie);
		tokenEnd = eatMultilineAnnotation(first, last, AC_ONLYSTARTOFLINE | AC_ONLYFIRSTCHAR, cookie, multilineCommentContinued);
		token.setType(Token::ANNOTATION);
		tokens.push_back(token);
		token.index_ = tokenEnd - first;
		if(!multilineCommentContinued)
			cookie = Token::NULL_COOKIE;
	}
	token.setCookie(Token::NULL_COOKIE);
	first += token.index_;

	Token::Cookie currentCookie = Token::NULL_COOKIE;
	AnnotationConstraint ac;
	bool appearedUnspace = false;
	while(first < last) {
		// work out which annotation constraints the current position satisfies
		ac = AC_NONE;
		if(first == originalFirst)	ac |= AC_ONLYSTARTOFLINE;
		if(!appearedUnspace)		ac |= AC_ONLYFIRSTCHAR;

		// tab
		if(enabledTokenTypes_[Token::TAB] && *first == L'\t') {	// a tab is always a token of its own
			tokenEnd = first + 1;
			token.setType(Token::TAB);
		}

		// white spaces
		else if(enabledTokenTypes_[Token::WHITESPACE] && first != (tokenEnd = eatWhiteSpaces(first, last, false)))
			token.setType(Token::WHITESPACE);

		// number
		else if(enabledTokenTypes_[Token::NUMBER] && first != (tokenEnd = eatNumbers(first, last)))
			token.setType(Token::NUMBER);

		// single quotation
		else if(enabledTokenTypes_[Token::SINGLE_QUOTATION]
				&& *first == L'\'' && first != (tokenEnd = eatQuotation(first, last, escapeByBackSolidus_)))
			token.setType(Token::SINGLE_QUOTATION);

		// double quotation
		else if(enabledTokenTypes_[Token::DOUBLE_QUOTATION]
				&& *first == L'\"' && first != (tokenEnd = eatQuotation(first, last, escapeByBackSolidus_)))
			token.setType(Token::DOUBLE_QUOTATION);

		// other quotations
		else if(enabledTokenTypes_[Token::OTHER_QUOTATION]
				&& *first != L'\'' && *first != L'\"'
				&& first != (tokenEnd = eatQuotation(first, last, escapeByBackSolidus_)))
			token.setType(Token::OTHER_QUOTATION);

		// single-line annotation
		else if(enabledTokenTypes_[Token::ANNOTATION] && first != (tokenEnd = eatSinglelineAnnotation(first, last, ac, currentCookie))) {
			token.setCookie(currentCookie);
			token.setType(Token::ANNOTATION);
		}

		// multi-line annotation
		else if(enabledTokenTypes_[Token::ANNOTATION]
				&& first != (tokenEnd = eatMultilineAnnotation(first, last, ac, currentCookie, multilineCommentContinued))) {
			token.setCookie(currentCookie);
			token.setType(Token::ANNOTATION);
			if(multilineCommentContinued)	// the annotation continues beyond the text
				cookie = currentCookie;
		}

		// operator
		else if(enabledTokenTypes_[Token::OPERATOR] && first != (tokenEnd = eatOperators(first, last)))
			token.setType(Token::OPERATOR);

		// numeral
		else if(enabledTokenTypes_[Token::NUMERAL] && first != (tokenEnd = eatNumerals(first, last)))
			token.setType(Token::NUMERAL);

		// ASCII control characters
		else if(enabledTokenTypes_[Token::ASCII_CONTROL] && first != (tokenEnd = eatASCIIControls(first, last)))
			token.setType(Token::ASCII_CONTROL);

		// Unicode control character (always a single code unit)
		else if(enabledTokenTypes_[Token::UNICODE_CONTROL] && eatUnicodeControls(first, last)) {
			tokenEnd = first + 1;
			token.setType(Token::UNICODE_CONTROL);
		}

		// identifier or keyword
		else if(enabledTokenTypes_[Token::IDENTIFIER] && first != (tokenEnd = eatIdentifier(first, last))) {
			if(keywords_.empty())
				token.setType(Token::IDENTIFIER);
			else if(eatKeyword(first, tokenEnd, currentCookie)) {
				token.setCookie(currentCookie);
				token.setType(Token::KEYWORD);
			} else
				token.setType(Token::IDENTIFIER);
		}

		// anything else: consume code points up to and including the first one that is not a grapheme extender
		else {
			CodePoint cp;
			tokenEnd = first;
			while(tokenEnd < last) {
				cp = UTF16Surrogates::decode(tokenEnd, last - tokenEnd);
				tokenEnd += (cp > 0xFFFF) ? 2 : 1;
				if(!BoundaryDetector::isGraphemeExtend(cp))
					break;
			}
			token.setType(Token::UNSPECIFIED);
		}

		if(tokens.empty() || token.getType() != Token::UNSPECIFIED || token.getType() != tokens.back().getType()
				|| (!tokens.empty() && (first - originalFirst) - tokens.back().getIndex() == 1	// a bracket in use forms a token of its own
				&& (wcschr(brackets_, first[-1]) != 0 || wcschr(brackets_, *first) != 0))) {
			token.index_ = first - originalFirst;
			tokens.push_back(token);
			currentCookie = Token::NULL_COOKIE;
			token.setCookie(Token::NULL_COOKIE);
		} else
			/* consecutive unspecified tokens are merged into the previous one, so nothing is pushed */;
		first = tokenEnd;

		if(token.getType() != Token::WHITESPACE && token.getType() != Token::TAB)
			appearedUnspace = true;
	}
}

/**
 *	Determines the multi-line annotation state at the end of the text.
 *	@param first, last	the range of the text to analyze
 *	@param cookie		the cookie of the multi-line annotation that continues from before
 *						the text, or Token::NULL_COOKIE if there is none
 *	@return				the cookie of the multi-line annotation that is still open at the end of
 *						the text, or Token::NULL_COOKIE if there is none. For an empty range
 *						@a cookie is returned as is.
 */
Token::Cookie Lexer::parseMultilineAnnotation(const char_t* first, const char_t* last, Token::Cookie cookie) const {
	assertValid();

	if(first == last)
		return cookie;

	const char_t* const lineStart = first;
	bool stillOpen;

	// finish the annotation that continues from before the text
	if(cookie != Token::NULL_COOKIE) {
		first = eatMultilineAnnotation(first, last, AC_ONLYSTARTOFLINE | AC_ONLYFIRSTCHAR, cookie, stillOpen);
		if(stillOpen)
			return cookie;
		cookie = Token::NULL_COOKIE;
	}

	const bool annotationsEnabled = enabledTokenTypes_[Token::ANNOTATION];
	const bool quotationsEnabled = enabledTokenTypes_[Token::SINGLE_QUOTATION]
		|| enabledTokenTypes_[Token::DOUBLE_QUOTATION]
		|| enabledTokenTypes_[Token::OTHER_QUOTATION];
	Token::Cookie ignoredCookie;	// receives the cookie of single-line annotations; never read

	while(first < last) {
		if(annotationsEnabled) {
			const AnnotationConstraint ac = (first == lineStart) ? AC_ONLYSTARTOFLINE | AC_ONLYFIRSTCHAR : AC_NONE;
			// a single-line annotation ends the scan: nothing is treated as still open
			if(eatSinglelineAnnotation(first, last, ac, ignoredCookie) != first)
				return Token::NULL_COOKIE;
			const char_t* const annotationEnd = eatMultilineAnnotation(first, last, ac, cookie, stillOpen);
			if(annotationEnd != first) {
				if(stillOpen)
					return cookie;
				cookie = Token::NULL_COOKIE;
				first = annotationEnd;
				continue;
			}
		}

		// skip a whole quotation so that delimiters inside it are ignored; otherwise advance one code unit
		const char_t* next = first + 1;
		if(quotationsEnabled) {
			const char_t* const quotationEnd = eatQuotation(first, last, escapeByBackSolidus_);
			if(quotationEnd != first)
				next = quotationEnd;
		}
		first = next;
	}
	return Token::NULL_COOKIE;
}

/// Removes every registered keyword, annotation and operator.
void Lexer::removeAll() {
	assertValid();
	operators_.clear();
	singlelineAnnotationDs_.clear();
	singlelineAnnotationBs_.clear();
	multilineAnnotations_.clear();
	clearKeywords();
	notifyChange();
}

/**
 *	Removes a token definition registered by one of the @c addXxxx methods.
 *	@param cookie					the cookie of the definition to remove
 *	@throw std::invalid_argument	@a cookie does not identify any registered definition
 */
void Lexer::removeIdentifiedToken(Token::Cookie cookie) {
	assertValid();

	Token::Type type = Token::COUNT;	// Token::COUNT means "not found yet"

	const KeywordsMap::iterator keywordEntry = keywords_.find(cookie);
	if(keywordEntry != keywords_.end()) {
		delete keywordEntry->second;	// the map owns its hash tables
		keywords_.erase(keywordEntry);
		type = Token::KEYWORD;
	} else if(multilineAnnotations_.erase(cookie) != 0
			|| singlelineAnnotationBs_.erase(cookie) != 0
			|| singlelineAnnotationDs_.erase(cookie) != 0)
		type = Token::ANNOTATION;

	if(type == Token::COUNT)
		throw invalid_argument("Specified cookie value is invalid.");
	if(eventListener_ != 0)
		eventListener_->onLexerRemovedIdentifiedToken(type, cookie);
	notifyChange();
}

/// Restores the initial settings and removes every registered keyword, annotation and operator.
void Lexer::reset() {
	assertValid();
	fill(enabledTokenTypes_, endof(enabledTokenTypes_), true);
	numberFormat_ = NF_NUMERAL_FOLLOWED_BY_ALPHANUMERAL;
	enableUnicodeWhiteSpaces_ = true;
	enableUnicodeAlphabets_ = true;
	escapeByBackSolidus_ = true;
	caseSensitive_ = true;
	removeAll();
}

/**
 *	Replaces the set of additional code points that are treated as identifier-start characters.
 *	@param first, last the UTF-16 text that lists the characters. The text is decoded with
 *	UTF16Surrogates::decode, and a decoded value above U+FFFF consumes two code units.
 */
void Lexer::setAdditionalAlphabets(const char_t* first, const char_t* last) {
	assertValid();
	assert(first != 0 && last != 0);
	additionalAlphabets_.clear();
	while(first < last) {
		const CodePoint cp = UTF16Surrogates::decode(first, last - first);
		additionalAlphabets_.insert(cp);
		first += (cp > 0xFFFF) ? 2 : 1;
	}
	notifyChange();
}

/**
 *	Replaces the set of additional code points that are treated as identifier-start characters.
 *	@param first, last the range of the code points (both pointers must be non-null)
 */
void Lexer::setAdditionalAlphabets(const CodePoint* first, const CodePoint* last) {
	assertValid();
	assert(first != 0 && last != 0);
	additionalAlphabets_.clear();
	additionalAlphabets_.insert(first, last);
	notifyChange();
}

/**
 *	ʂƂĎgp镶ݒ肷
 *	@param brackets					Jʂׂ (NUL I)
 *	@throw std::invalid_argument	JʂƂĎgpłȂɊ܂܂ĂƂX[
 */
void Lexer::setBrackets(const char_t* brackets) {
	assertValid();
	assert(brackets != 0);

	ostringstream_t	ss;

	for(size_t i = 0; brackets[i] != 0; ++i) {
		for(size_t j = 0; ; ++j) {
			if(bracketPairs[j].first == 0 || bracketPairs[j].second == brackets[i])
				throw invalid_argument("Specified character can not be used as an opener.");
			else if(bracketPairs[j].first == brackets[i]) {
				ss << bracketPairs[j].first << bracketPairs[j].second;
				break;
			}
		}
	}
	const string_t str = ss.str();
	delete[] brackets_;
	brackets_ = new char_t[str.length() + 1];
	wcscpy(brackets_, str.c_str());
}

/**
 *	Zqݒ肷
 *	@param operators Zq̏W
 */
void Lexer::setOperators(const set<string_t>& operators) {
	assertValid();
	operators_.clear();
	for(set<string_t>::const_iterator it = operators.begin(); it != operators.end(); ++it) {
		if(it->empty())
			continue;
		operators_[it->at(0)].insert(*it);
	}
	notifyChange();
}

/* [EOF] */