// Lexer.cpp
// (c) 2004-2005 exeal

#include "StdAfx.h"
#include "Lexer.h"
#include "TextSearcher.h"	// CBoundarySearcher::IsGraphemeBase

using namespace Ascension;
using namespace std;


namespace {
#if ASCENSION_UNICODE_VERSION != 0x0410
#error This array is based on old version of Unicode.
#endif
	// Table of bracket pairs as {opener, closer}: characters of the Unicode general
	// categories Ps (open punctuation) and Pe (close punctuation), plus '<' and '>' for XML.
	// Entries are listed in ascending order of the opener.
	const pair<char_t, char_t>	bracketPairs[] = {
		make_pair(0x0028, 0x0029),	// Parenthesis
		make_pair(0x003C, 0x003E),	// [for XML] Less-Than/Greater-Than Sign
		make_pair(0x005B, 0x005D),	// Square Bracket
		make_pair(0x007B, 0x007D),	// Curly Bracket
		make_pair(0x0F3A, 0x0F3B),	// Tibetan Mark Gug Rtags Gyon and Gyas
		make_pair(0x0F3C, 0x0F3D),	// Tibetan Mark Ang Khang Gyon and Gyas
		make_pair(0x169B, 0x169C),	// Ogham Feather Maek and reversed one
//		make_pair(0x201A, 0x????),	// Single Low-9 Quotation Mark
//		make_pair(0x201E, 0x????),	// Double Low-9 Quotation Mark
		make_pair(0x2045, 0x2046),	// Square Bracket With Quill
		make_pair(0x207D, 0x207E),	// Superscript Parenthesis
		make_pair(0x208D, 0x208E),	// Subscript Parenthesis
		make_pair(0x2329, 0x232A),	// Pointing Angle Bracket
		make_pair(0x23B4, 0x23B5),	// Square Bracket (top/bottom)
		make_pair(0x2768, 0x2769),	// Medium Parenthesis Ornament
		make_pair(0x276A, 0x276B),	// Medium Flattened Parenthesis Ornament
		make_pair(0x276C, 0x276D),	// Medium Pointing Angle Bracket Ornament
		make_pair(0x276E, 0x276F),	// Heavy Pointing Angle Quotation Mark Ornament
		make_pair(0x2770, 0x2771),	// Heavy Pointing Angle Bracket Ornament
		make_pair(0x2772, 0x2773),	// Light Tortoise Shell Bracket Ornament
		make_pair(0x2774, 0x2775),	// Medium Curly Bracket Ornament
		make_pair(0x27C5, 0x27C6),	// S-Shaped Bag Delimiter
		make_pair(0x27E6, 0x27E7),	// Mathematical White Square Bracket
		make_pair(0x27E8, 0x27E9),	// Mathematical Angle Bracket
		make_pair(0x27EA, 0x27EB),	// Mathematical Double Angle Bracket
		make_pair(0x2983, 0x2984),	// White Curly Bracket
		make_pair(0x2985, 0x2986),	// White Parenthesis
		make_pair(0x2987, 0x2988),	// Z Notation Image Bracket
		make_pair(0x2989, 0x298A),	// Z Notation Binding Bracket
		make_pair(0x298B, 0x298C),	// Square Bracket With Underbar
		make_pair(0x298D, 0x298E),	// Left Square Bracket With Tick In Top Corner and Right ... Bottom
		make_pair(0x298F, 0x2990),	// Left Square Bracket With Tick In Bottom Corner and Right ... Top
		make_pair(0x2991, 0x2992),	// Angle Bracket With Dot
		make_pair(0x2993, 0x2994),	// Arc Less-Than Bracket
		make_pair(0x2995, 0x2996),	// Double Arc Greater-Than Bracket
		make_pair(0x2997, 0x2998),	// Black Tortoise Shell Bracket
		make_pair(0x29D8, 0x29D9),	// Wiggly Fence
		make_pair(0x29DA, 0x29DB),	// Double Wiggly Fence
		make_pair(0x29FC, 0x29FD),	// Pointing Curved Angle Bracket
		make_pair(0x3008, 0x3009),	// Angle Bracket
		make_pair(0x300A, 0x300B),	// Double Angle Bracket
		make_pair(0x300C, 0x300D),	// Corner Bracket
		make_pair(0x300E, 0x300F),	// White Corner Bracket
		make_pair(0x3010, 0x3011),	// Black Lenticular Bracket
		make_pair(0x3014, 0x3015),	// Tortoise Shell Bracket
		make_pair(0x3016, 0x3017),	// White Lenticular Bracket
		make_pair(0x3018, 0x3019),	// White Tortoise Shell Bracket
		make_pair(0x301A, 0x301B),	// White Square Bracket
		make_pair(0x301D, 0x301F),	// Double Prime Quotation Mark and reversed one
//		make_pair(0x????, 0x301E),	// Double Prime Quotation Mark (deprecated: mistaken analogue)
		make_pair(0xFD3E, 0xFD3F),	// Ornate Parenthesis
		// The closer is U+FE18; the previous value 0xEF18 was a transposed-digit typo
		// that pointed into the Private Use Area.
		make_pair(0xFE17, 0xFE18),	// Presentation Form For Vertical White Lenticular Bracket
		make_pair(0xFE35, 0xFE36),	// Presentation Form For Vertical Parenthesis
		make_pair(0xFE37, 0xFE38),	// - Curly Bracket
		make_pair(0xFE39, 0xFE3A),	// - Tortoise Shell Bracket
		make_pair(0xFE3B, 0xFE3C),	// - Black Lenticular Bracket
		make_pair(0xFE3D, 0xFE3E),	// - Double Angle Bracket
		make_pair(0xFE3F, 0xFE40),	// - Angle Bracket
		make_pair(0xFE41, 0xFE42),	// - Corner Bracket
		make_pair(0xFE43, 0xFE44),	// - White Corner Bracket
		make_pair(0xFE45, 0xFE46),	// Sesame Dot and White one
		make_pair(0xFE47, 0xFE48),	// - Square Bracket
		make_pair(0xFE59, 0xFE5A),	// Small Parenthesis
		make_pair(0xFE5B, 0xFE5C),	// Small Curly Bracket
		make_pair(0xFE5D, 0xFE5E),	// Small Tortoise Shell Bracket
		make_pair(0xFF08, 0xFF09),	// Fullwidth Parenthesis
		make_pair(0xFF3B, 0xFF3D),	// Fullwidth Square Bracket
		make_pair(0xFF5B, 0xFF5D),	// Fullwidth Curly Bracket
		make_pair(0xFF5F, 0xFF60),	// Fullwidth White Parenthesis
		make_pair(0xFF62, 0xFF63),	// Halfwidth Corner Bracket
		make_pair(0x0000, 0x0000)	// all-zero final entry (presumably an end-of-table sentinel -- confirm against the lookup code)
	};

#if ASCENSION_UNICODE_VERSION != 0x0410
#error These arrays are based on old version of Unicode.
#endif
	// Code points that carry the ID_Start property in DerivedCoreProperties.txt.
	// The values live in the included file; IsIdentifierStartCodePoint() binary-searches
	// this array, so the included list must be sorted in ascending order.
	const CodePoint	IDStartCodePoints[] = {
#include "script\Lexer_IdentifierStart"
	};
	// Code points accepted inside an identifier in addition to the ones above: general
	// categories Mn (Mark, Non-Spacing), Mc (Mark, Spacing Combining), Nd (Number, Decimal)
	// and Pc (Punctuation, Connector). IsIdentifierContinueCodePoint() binary-searches
	// this array, so the included list must be sorted in ascending order.
	const CodePoint	IDContinueCodePoints[] = {
#include "script\Lexer_IdentifierContinue"
	};
}


// CLexer class implementation
/////////////////////////////////////////////////////////////////////////////

///	Default list of opening brackets: parenthesis, square bracket and curly bracket.
///	The constructor passes this list to SetBrackets().
const char_t	CLexer::m_wszDefaultOpeners[] = L"([{";
///	List of opening brackets that also covers the non-ASCII Unicode openers.
///	NOTE(review): the file-local bracketPairs table also contains U+FE17, which does not
///	appear in this list -- confirm whether the omission is intentional.
const char_t	CLexer::m_wszUnicodeOpeners[] = L"([{\x0F3A\xF3C\x169B\x2045\x207D\x208D\x2329\x23B4"
												L"\x2768\x276A\x276C\x276E\x2770\x2772\x2774\x27C5"
												L"\x27E6\x27E8\x27EA\x2983\x2985\x2987\x2989\x298B"
												L"\x298D\x298F\x2991\x2993\x2995\x2997\x29D8\x29DA"
												L"\x29FC\x3008\x300A\x300C\x300E\x3010\x3014\x3016"
												L"\x3018\x301A\x301D\xFD3E\xFE35\xFE37\xFE39\xFE3B"
												L"\xFE3D\xFE3F\xFE41\xFE43\xFE45\xFE47\xFE59\xFE5B"
												L"\xFE5D\xFF08\xFF3B\xFF5B\xFF5F\xFF62";

// Next cookie value to hand out. It starts one past NullCookie; the Add* methods
// return the current value and then increment it.
TokenCookie		CLexer::m_nCookie = NullCookie + 1;

// Arrays for case folding from CaseFolding.txt of UCD
// The guard below stops the build when the library targets a different Unicode
// version than the one these tables were written for.
#if ASCENSION_UNICODE_VERSION != 0x0410
#error These arrays are based on old version of Unicode.
#endif
// Code points (BMP only, every value fits in 16 bits) that have a case-folded form,
// listed in ascending order (presumably so the table can be binary-searched --
// confirm against the lookup code).
const char_t	CLexer::m_casedCodesUcs2[] = {
	0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047, 0x0048,	// Basic Latin
	0x0049, 0x004A, 0x004B, 0x004C, 0x004D, 0x004E, 0x004F, 0x0050,
	0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057, 0x0058,
	0x0059, 0x005A, 0x00B5,
	0x00C0, 0x00C1, 0x00C2, 0x00C3, 0x00C4, 0x00C5, 0x00C6, 0x00C7,	// Latin-1 Supplement
	0x00C8, 0x00C9, 0x00CA, 0x00CB, 0x00CC, 0x00CD, 0x00CE, 0x00CF,
	0x00D0, 0x00D1, 0x00D2, 0x00D3, 0x00D4, 0x00D5, 0x00D6, 0x00D8,
	0x00D9, 0x00DA, 0x00DB, 0x00DC, 0x00DD, 0x00DE,
	0x0100, 0x0102, 0x0104, 0x0106, 0x0108, 0x010A, 0x010C, 0x010E,	// Latin Extended-A
	0x0110, 0x0112, 0x0114, 0x0116, 0x0118, 0x011A, 0x011C, 0x011E,
	0x0120, 0x0122, 0x0124, 0x0126, 0x0128, 0x012A, 0x012C, 0x012E,
	0x0132, 0x0134, 0x0136, 0x0139, 0x013B, 0x013D, 0x013F, 0x0141,
	0x0143, 0x0145, 0x0147, 0x014A, 0x014C, 0x014E, 0x0150, 0x0152,
	0x0154, 0x0156, 0x0158, 0x015A, 0x015C, 0x015E, 0x0160, 0x0162,
	0x0164, 0x0166, 0x0168, 0x016A, 0x016C, 0x016E, 0x0170, 0x0172,
	0x0174, 0x0176, 0x0178, 0x0179, 0x017B, 0x017D, 0x017F,
	0x0181, 0x0182, 0x0184, 0x0186, 0x0187, 0x0189, 0x018A, 0x018B,	// Latin Extended-B
	0x018E, 0x018F, 0x0190, 0x0191, 0x0193, 0x0194, 0x0196, 0x0197,
	0x0198, 0x019C, 0x019D, 0x019F, 0x01A0, 0x01A2, 0x01A4, 0x01A6,
	0x01A7, 0x01A9, 0x01AC, 0x01AE, 0x01AF, 0x01B1, 0x01B2, 0x01B3,
	0x01B5, 0x01B7, 0x01B8, 0x01BC, 0x01C4, 0x01C5, 0x01C7, 0x01C8,
	0x01CA, 0x01CB, 0x01CD, 0x01CF, 0x01D1, 0x01D3, 0x01D5, 0x01D7,
	0x01D9, 0x01DB, 0x01DE, 0x01E0, 0x01E2, 0x01E4, 0x01E6, 0x01E8,
	0x01EA, 0x01EC, 0x01EE, 0x01F1, 0x01F2, 0x01F4, 0x01F6, 0x01F7,
	0x01F8, 0x01FA, 0x01FC, 0x01FE, 0x0200, 0x0202, 0x0204, 0x0206,
	0x0208, 0x020A, 0x020C, 0x020E, 0x0210, 0x0212, 0x0214, 0x0216,
	0x0218, 0x021A, 0x021C, 0x021E, 0x0220, 0x0222, 0x0224, 0x0226,
	0x0228, 0x022A, 0x022C, 0x022E, 0x0230, 0x0232,
	0x0345,															// Combining Diacritical Marks
	0x0386, 0x0388, 0x0389, 0x038A, 0x038C, 0x038E, 0x038F, 0x0391,	// Greek
	0x0392, 0x0393, 0x0394, 0x0395, 0x0396, 0x0397, 0x0398, 0x0399,
	0x039A, 0x039B, 0x039C, 0x039D, 0x039E, 0x039F, 0x03A0, 0x03A1,
	0x03A3, 0x03A4, 0x03A5, 0x03A6, 0x03A7, 0x03A8, 0x03A9, 0x03AA,
	0x03AB, 0x03C2, 0x03D0, 0x03D1, 0x03D5, 0x03D6, 0x03D8, 0x03DA,
	0x03DC, 0x03DE, 0x03E0, 0x03E2, 0x03E4, 0x03E6, 0x03E8, 0x03EA,
	0x03EC, 0x03EE, 0x03F0, 0x03F1, 0x03F4, 0x03F5, 0x03F7, 0x03F9,
	0x03FA,
	0x0400, 0x0401, 0x0402, 0x0403, 0x0404, 0x0405, 0x0406, 0x0407,	// Cyrillic
	0x0408, 0x0409, 0x040A, 0x040B, 0x040C, 0x040D, 0x040E, 0x040F,
	0x0410, 0x0411, 0x0412, 0x0413, 0x0414, 0x0415, 0x0416, 0x0417,
	0x0418, 0x0419, 0x041A, 0x041B, 0x041C, 0x041D, 0x041E, 0x041F,
	0x0420, 0x0421, 0x0422, 0x0423, 0x0424, 0x0425, 0x0426, 0x0427,
	0x0428, 0x0429, 0x042A, 0x042B, 0x042C, 0x042D, 0x042E, 0x042F,
	0x0460, 0x0462, 0x0464, 0x0466, 0x0468, 0x046A, 0x046C, 0x046E,
	0x0470, 0x0472, 0x0474, 0x0476, 0x0478, 0x047A, 0x047C, 0x047E,
	0x0480, 0x048A, 0x048C, 0x048E, 0x0490, 0x0492, 0x0494, 0x0496,
	0x0498, 0x049A, 0x049C, 0x049E, 0x04A0, 0x04A2, 0x04A4, 0x04A6,
	0x04A8, 0x04AA, 0x04AC, 0x04AE, 0x04B0, 0x04B2, 0x04B4, 0x04B6,
	0x04B8, 0x04BA, 0x04BC, 0x04BE, 0x04C1, 0x04C3, 0x04C5, 0x04C7,
	0x04C9, 0x04CB, 0x04CD, 0x04D0, 0x04D2, 0x04D4, 0x04D6, 0x04D8,
	0x04DA, 0x04DC, 0x04DE, 0x04E0, 0x04E2, 0x04E4, 0x04E6, 0x04E8,
	0x04EA, 0x04EC, 0x04EE, 0x04F0, 0x04F2, 0x04F4, 0x04F8, 0x0500,
	0x0502, 0x0504, 0x0506, 0x0508, 0x050A, 0x050C, 0x050E,
	0x0531, 0x0532, 0x0533, 0x0534, 0x0535, 0x0536, 0x0537, 0x0538,	// Armenian
	0x0539, 0x053A, 0x053B, 0x053C, 0x053D, 0x053E, 0x053F, 0x0540,
	0x0541, 0x0542, 0x0543, 0x0544, 0x0545, 0x0546, 0x0547, 0x0548,
	0x0549, 0x054A, 0x054B, 0x054C, 0x054D, 0x054E, 0x054F, 0x0550,
	0x0551, 0x0552, 0x0553, 0x0554, 0x0555, 0x0556,
	0x1E00, 0x1E02, 0x1E04, 0x1E06, 0x1E08, 0x1E0A, 0x1E0C, 0x1E0E,	// Latin Extended Additional
	0x1E10, 0x1E12, 0x1E14, 0x1E16, 0x1E18, 0x1E1A, 0x1E1C, 0x1E1E,
	0x1E20, 0x1E22, 0x1E24, 0x1E26, 0x1E28, 0x1E2A, 0x1E2C, 0x1E2E,
	0x1E30, 0x1E32, 0x1E34, 0x1E36, 0x1E38, 0x1E3A, 0x1E3C, 0x1E3E,
	0x1E40, 0x1E42, 0x1E44, 0x1E46, 0x1E48, 0x1E4A, 0x1E4C, 0x1E4E,
	0x1E50, 0x1E52, 0x1E54, 0x1E56, 0x1E58, 0x1E5A, 0x1E5C, 0x1E5E,
	0x1E60, 0x1E62, 0x1E64, 0x1E66, 0x1E68, 0x1E6A, 0x1E6C, 0x1E6E,
	0x1E70, 0x1E72, 0x1E74, 0x1E76, 0x1E78, 0x1E7A, 0x1E7C, 0x1E7E,
	0x1E80, 0x1E82, 0x1E84, 0x1E86, 0x1E88, 0x1E8A, 0x1E8C, 0x1E8E,
	0x1E90, 0x1E92, 0x1E94, 0x1E9B, 0x1EA0, 0x1EA2, 0x1EA4, 0x1EA6,
	0x1EA8, 0x1EAA, 0x1EAC, 0x1EAE, 0x1EB0, 0x1EB2, 0x1EB4, 0x1EB6,
	0x1EB8, 0x1EBA, 0x1EBC, 0x1EBE, 0x1EC0, 0x1EC2, 0x1EC4, 0x1EC6,
	0x1EC8, 0x1ECA, 0x1ECC, 0x1ECE, 0x1ED0, 0x1ED2, 0x1ED4, 0x1ED6,
	0x1ED8, 0x1EDA, 0x1EDC, 0x1EDE, 0x1EE0, 0x1EE2, 0x1EE4, 0x1EE6,
	0x1EE8, 0x1EEA, 0x1EEC, 0x1EEE, 0x1EF0, 0x1EF2, 0x1EF4, 0x1EF6,
	0x1EF8,
	0x1F08, 0x1F09, 0x1F0A, 0x1F0B, 0x1F0C, 0x1F0D, 0x1F0E, 0x1F0F,	// Greek Extended
	0x1F18, 0x1F19, 0x1F1A, 0x1F1B, 0x1F1C, 0x1F1D, 0x1F28, 0x1F29,
	0x1F2A, 0x1F2B, 0x1F2C, 0x1F2D, 0x1F2E, 0x1F2F, 0x1F38, 0x1F39,
	0x1F3A, 0x1F3B, 0x1F3C, 0x1F3D, 0x1F3E, 0x1F3F, 0x1F48, 0x1F49,
	0x1F4A, 0x1F4B, 0x1F4C, 0x1F4D, 0x1F59, 0x1F5B, 0x1F5D, 0x1F5F,
	0x1F68, 0x1F69, 0x1F6A, 0x1F6B, 0x1F6C, 0x1F6D, 0x1F6E, 0x1F6F,
	0x1F88, 0x1F89, 0x1F8A, 0x1F8B, 0x1F8C, 0x1F8D, 0x1F8E, 0x1F8F,
	0x1F98, 0x1F99, 0x1F9A, 0x1F9B, 0x1F9C, 0x1F9D, 0x1F9E, 0x1F9F,
	0x1FA8, 0x1FA9, 0x1FAA, 0x1FAB, 0x1FAC, 0x1FAD, 0x1FAE, 0x1FAF,
	0x1FB8, 0x1FB9, 0x1FBA, 0x1FBB, 0x1FBC, 0x1FBE, 0x1FC8, 0x1FC9,
	0x1FCA, 0x1FCB, 0x1FCC, 0x1FD8, 0x1FD9, 0x1FDA, 0x1FDB, 0x1FE8,
	0x1FE9, 0x1FEA, 0x1FEB, 0x1FEC, 0x1FF8, 0x1FF9, 0x1FFA, 0x1FFB,
	0x1FFC,
	0x2126, 0x212A, 0x212B,											// Letterlike Symbols
	0x2160, 0x2161, 0x2162, 0x2163, 0x2164, 0x2165, 0x2166, 0x2167,	// Number Forms
	0x2168, 0x2169, 0x216A, 0x216B, 0x216C, 0x216D, 0x216E, 0x216F,
	0x24B6, 0x24B7, 0x24B8, 0x24B9, 0x24BA, 0x24BB, 0x24BC, 0x24BD,	// Enclosed Alphanumerics
	0x24BE, 0x24BF, 0x24C0, 0x24C1, 0x24C2, 0x24C3, 0x24C4, 0x24C5,
	0x24C6, 0x24C7, 0x24C8, 0x24C9, 0x24CA, 0x24CB, 0x24CC, 0x24CD,
	0x24CE, 0x24CF,
	0xFF21, 0xFF22, 0xFF23, 0xFF24, 0xFF25, 0xFF26, 0xFF27, 0xFF28,	// Halfwidth and Fullwidth Forms
	0xFF29, 0xFF2A, 0xFF2B, 0xFF2C, 0xFF2D, 0xFF2E, 0xFF2F, 0xFF30,
	0xFF31, 0xFF32, 0xFF33, 0xFF34, 0xFF35, 0xFF36, 0xFF37, 0xFF38,
	0xFF39, 0xFF3A,
};
// Case-folded counterparts. Presumably entry i is the folding of m_casedCodesUcs2[i]
// (the first entries pair 0x0041 with 0x0061, and the block comments line up) --
// confirm against the lookup code.
const char_t	CLexer::m_caseFoldedUcs2[] = {
	0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067, 0x0068,	// Basic Latin
	0x0069, 0x006A, 0x006B, 0x006C, 0x006D, 0x006E, 0x006F, 0x0070,
	0x0071, 0x0072, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077, 0x0078,
	0x0079, 0x007A, 0x03BC,
	0x00E0, 0x00E1, 0x00E2, 0x00E3, 0x00E4, 0x00E5, 0x00E6, 0x00E7,	// Latin-1 Supplement
	0x00E8, 0x00E9, 0x00EA, 0x00EB, 0x00EC, 0x00ED, 0x00EE, 0x00EF,
	0x00F0, 0x00F1, 0x00F2, 0x00F3, 0x00F4, 0x00F5, 0x00F6, 0x00F8,
	0x00F9, 0x00FA, 0x00FB, 0x00FC, 0x00FD, 0x00FE,
	0x0101, 0x0103, 0x0105, 0x0107, 0x0109, 0x010B, 0x010D, 0x010F,	// Latin Extended-A
	0x0111, 0x0113, 0x0115, 0x0117, 0x0119, 0x011B, 0x011D, 0x011F,
	0x0121, 0x0123, 0x0125, 0x0127, 0x0129, 0x012B, 0x012D, 0x012F,
	0x0133, 0x0135, 0x0137, 0x013A, 0x013C, 0x013E, 0x0140, 0x0142,
	0x0144, 0x0146, 0x0148, 0x014B, 0x014D, 0x014F, 0x0151, 0x0153,
	0x0155, 0x0157, 0x0159, 0x015B, 0x015D, 0x015F, 0x0161, 0x0163,
	0x0165, 0x0167, 0x0169, 0x016B, 0x016D, 0x016F, 0x0171, 0x0173,
	0x0175, 0x0177, 0x00FF, 0x017A, 0x017C, 0x017E, 0x0073,
	0x0253, 0x0183, 0x0185, 0x0254, 0x0188, 0x0256, 0x0257, 0x018C,	// Latin Extended-B
	0x01DD, 0x0259, 0x025B, 0x0192, 0x0260, 0x0263, 0x0269, 0x0268,
	0x0199, 0x026F, 0x0272, 0x0275, 0x01A1, 0x01A3, 0x01A5, 0x0280,
	0x01A8, 0x0283, 0x01AD, 0x0288, 0x01B0, 0x028A, 0x028B, 0x01B4,
	0x01B6, 0x0292, 0x01B9, 0x01BD, 0x01C6, 0x01C6, 0x01C9, 0x01C9,
	0x01CC, 0x01CC, 0x01CE, 0x01D0, 0x01D2, 0x01D4, 0x01D6, 0x01D8,
	0x01DA, 0x01DC, 0x01DF, 0x01E1, 0x01E3, 0x01E5, 0x01E7, 0x01E9,
	0x01EB, 0x01ED, 0x01EF, 0x01F3, 0x01F3, 0x01F5, 0x0195, 0x01BF,
	0x01F9, 0x01FB, 0x01FD, 0x01FF, 0x0201, 0x0203, 0x0205, 0x0207,
	0x0209, 0x020B, 0x020D, 0x020F, 0x0211, 0x0213, 0x0215, 0x0217,
	0x0219, 0x021B, 0x021D, 0x021F, 0x019E, 0x0223, 0x0225, 0x0227,
	0x0229, 0x022B, 0x022D, 0x022F, 0x0231, 0x0233,
	0x03B9,															// Combining Diacritical Marks
	0x03AC, 0x03AD, 0x03AE, 0x03AF, 0x03CC, 0x03CD, 0x03CE, 0x03B1,	// Greek
	0x03B2, 0x03B3, 0x03B4, 0x03B5, 0x03B6, 0x03B7, 0x03B8, 0x03B9,
	0x03BA, 0x03BB, 0x03BC, 0x03BD, 0x03BE, 0x03BF, 0x03C0, 0x03C1,
	0x03C3, 0x03C4, 0x03C5, 0x03C6, 0x03C7, 0x03C8, 0x03C9, 0x03CA,
	0x03CB, 0x03C3, 0x03B2, 0x03B8, 0x03C6, 0x03C0, 0x03D9, 0x03DB,
	0x03DD, 0x03DF, 0x03E1, 0x03E3, 0x03E5, 0x03E7, 0x03E9, 0x03EB,
	0x03ED, 0x03EF, 0x03BA, 0x03C1, 0x03B8, 0x03B5, 0x03F8, 0x03F2,
	0x03FB,
	0x0450, 0x0451, 0x0452, 0x0453, 0x0454, 0x0455, 0x0456, 0x0457,	// Cyrillic
	0x0458, 0x0459, 0x045A, 0x045B, 0x045C, 0x045D, 0x045E, 0x045F,
	0x0430, 0x0431, 0x0432, 0x0433, 0x0434, 0x0435, 0x0436, 0x0437,
	0x0438, 0x0439, 0x043A, 0x043B, 0x043C, 0x043D, 0x043E, 0x043F,
	0x0440, 0x0441, 0x0442, 0x0443, 0x0444, 0x0445, 0x0446, 0x0447,
	0x0448, 0x0449, 0x044A, 0x044B, 0x044C, 0x044D, 0x044E, 0x044F,
	0x0461, 0x0463, 0x0465, 0x0467, 0x0469, 0x046B, 0x046D, 0x046F,
	0x0471, 0x0473, 0x0475, 0x0477, 0x0479, 0x047B, 0x047D, 0x047F,
	0x0481, 0x048B, 0x048D, 0x048F, 0x0491, 0x0493, 0x0495, 0x0497,
	0x0499, 0x049B, 0x049D, 0x049F, 0x04A1, 0x04A3, 0x04A5, 0x04A7,
	0x04A9, 0x04AB, 0x04AD, 0x04AF, 0x04B1, 0x04B3, 0x04B5, 0x04B7,
	0x04B9, 0x04BB, 0x04BD, 0x04BF, 0x04C2, 0x04C4, 0x04C6, 0x04C8,
	0x04CA, 0x04CC, 0x04CE, 0x04D1, 0x04D3, 0x04D5, 0x04D7, 0x04D9,
	0x04DB, 0x04DD, 0x04DF, 0x04E1, 0x04E3, 0x04E5, 0x04E7, 0x04E9,
	0x04EB, 0x04ED, 0x04EF, 0x04F1, 0x04F3, 0x04F5, 0x04F9, 0x0501,
	0x0503, 0x0505, 0x0507, 0x0509, 0x050B, 0x050D, 0x050F,
	0x0561, 0x0562, 0x0563, 0x0564, 0x0565, 0x0566, 0x0567, 0x0568,	// Armenian
	0x0569, 0x056A, 0x056B, 0x056C, 0x056D, 0x056E, 0x056F, 0x0570,
	0x0571, 0x0572, 0x0573, 0x0574, 0x0575, 0x0576, 0x0577, 0x0578,
	0x0579, 0x057A, 0x057B, 0x057C, 0x057D, 0x057E, 0x057F, 0x0580,
	0x0581, 0x0582, 0x0583, 0x0584, 0x0585, 0x0586,
	0x1E01, 0x1E03, 0x1E05, 0x1E07, 0x1E09, 0x1E0B, 0x1E0D, 0x1E0F,	// Latin Extended Additional
	0x1E11, 0x1E13, 0x1E15, 0x1E17, 0x1E19, 0x1E1B, 0x1E1D, 0x1E1F,
	0x1E21, 0x1E23, 0x1E25, 0x1E27, 0x1E29, 0x1E2B, 0x1E2D, 0x1E2F,
	0x1E31, 0x1E33, 0x1E35, 0x1E37, 0x1E39, 0x1E3B, 0x1E3D, 0x1E3F,
	0x1E41, 0x1E43, 0x1E45, 0x1E47, 0x1E49, 0x1E4B, 0x1E4D, 0x1E4F,
	0x1E51, 0x1E53, 0x1E55, 0x1E57, 0x1E59, 0x1E5B, 0x1E5D, 0x1E5F,
	0x1E61, 0x1E63, 0x1E65, 0x1E67, 0x1E69, 0x1E6B, 0x1E6D, 0x1E6F,
	0x1E71, 0x1E73, 0x1E75, 0x1E77, 0x1E79, 0x1E7B, 0x1E7D, 0x1E7F,
	0x1E81, 0x1E83, 0x1E85, 0x1E87, 0x1E89, 0x1E8B, 0x1E8D, 0x1E8F,
	0x1E91, 0x1E93, 0x1E95, 0x1E61, 0x1EA1, 0x1EA3, 0x1EA5, 0x1EA7,
	0x1EA9, 0x1EAB, 0x1EAD, 0x1EAF, 0x1EB1, 0x1EB3, 0x1EB5, 0x1EB7,
	0x1EB9, 0x1EBB, 0x1EBD, 0x1EBF, 0x1EC1, 0x1EC3, 0x1EC5, 0x1EC7,
	0x1EC9, 0x1ECB, 0x1ECD, 0x1ECF, 0x1ED1, 0x1ED3, 0x1ED5, 0x1ED7,
	0x1ED9, 0x1EDB, 0x1EDD, 0x1EDF, 0x1EE1, 0x1EE3, 0x1EE5, 0x1EE7,
	0x1EE9, 0x1EEB, 0x1EED, 0x1EEF, 0x1EF1, 0x1EF3, 0x1EF5, 0x1EF7,
	0x1EF9,
	0x1F00, 0x1F01, 0x1F02, 0x1F03, 0x1F04, 0x1F05, 0x1F06, 0x1F07,	// Greek Extended
	0x1F10, 0x1F11, 0x1F12, 0x1F13, 0x1F14, 0x1F15, 0x1F20, 0x1F21,
	0x1F22, 0x1F23, 0x1F24, 0x1F25, 0x1F26, 0x1F27, 0x1F30, 0x1F31,
	0x1F32, 0x1F33, 0x1F34, 0x1F35, 0x1F36, 0x1F37, 0x1F40, 0x1F41,
	0x1F42, 0x1F43, 0x1F44, 0x1F45, 0x1F51, 0x1F53, 0x1F55, 0x1F57,
	0x1F60, 0x1F61, 0x1F62, 0x1F63, 0x1F64, 0x1F65, 0x1F66, 0x1F67,
	0x1F80, 0x1F81, 0x1F82, 0x1F83, 0x1F84, 0x1F85, 0x1F86, 0x1F87,
	0x1F90, 0x1F91, 0x1F92, 0x1F93, 0x1F94, 0x1F95, 0x1F96, 0x1F97,
	0x1FA0, 0x1FA1, 0x1FA2, 0x1FA3, 0x1FA4, 0x1FA5, 0x1FA6, 0x1FA7,
	0x1FB0, 0x1FB1, 0x1F70, 0x1F71, 0x1FB3, 0x03B9, 0x1F72, 0x1F73,
	0x1F74, 0x1F75, 0x1FC3, 0x1FD0, 0x1FD1, 0x1F76, 0x1F77, 0x1FE0,
	0x1FE1, 0x1F7A, 0x1F7B, 0x1FE5, 0x1F78, 0x1F79, 0x1F7C, 0x1F7D,
	0x1FF3,
	0x03C9, 0x006B, 0x00E5,											// Letterlike Symbols
	0x2170, 0x2171, 0x2172, 0x2173, 0x2174, 0x2175, 0x2176, 0x2177,	// Number Forms
	0x2178, 0x2179, 0x217A, 0x217B, 0x217C, 0x217D, 0x217E, 0x217F, 
	0x24D0, 0x24D1, 0x24D2, 0x24D3, 0x24D4, 0x24D5, 0x24D6, 0x24D7,	// Enclosed Alphanumerics
	0x24D8, 0x24D9, 0x24DA, 0x24DB, 0x24DC, 0x24DD, 0x24DE, 0x24DF,
	0x24E0, 0x24E1, 0x24E2, 0x24E3, 0x24E4, 0x24E5, 0x24E6, 0x24E7,
	0x24E8, 0x24E9,
	0xFF41, 0xFF42, 0xFF43, 0xFF44, 0xFF45, 0xFF46, 0xFF47, 0xFF48,	// Halfwidth and Fullwidth Forms
	0xFF49, 0xFF4A, 0xFF4B, 0xFF4C, 0xFF4D, 0xFF4E, 0xFF4F, 0xFF50,
	0xFF51, 0xFF52, 0xFF53, 0xFF54, 0xFF55, 0xFF56, 0xFF57, 0xFF58,
	0xFF59, 0xFF5A,
};
// Number of entries in m_casedCodesUcs2.
const size_t	CLexer::m_cCasedCodesUcs2 = _countof(CLexer::m_casedCodesUcs2);


/**
 *	Constructor. Starts out case sensitive, with backslash escaping, Unicode
 *	alphabets and Unicode white spaces switched on, every token type enabled and
 *	the default bracket set ("([{") installed.
 *	@param pEventListener	listener to notify (may be null; it is null-checked before every use)
 */
CLexer::CLexer(CLexer::IEventListener* pEventListener)
		: m_bFreezed(false),
		m_bCaseSensitive(true),
		m_bEscapeByBackSolidus(true),
		m_bEnableUnicodeAlphabets(true),
		m_bEnableUnicodeWhiteSpaces(true),
		m_numberFormat(NF_NUMERAL_FOLLOWED_BY_ALPHANUMERAL),
		m_pwszBrackets(0),
		m_pEventListener(pEventListener) {
	// Enable every token type.
	fill(m_enabledTokenTypes, _endof(m_enabledTokenTypes), true);
	// Install the default openers.
	SetBrackets(m_wszDefaultOpeners);
}

///	Destructor. Releases the bracket list and every registered keyword table.
CLexer::~CLexer() {
	delete[] m_pwszBrackets;	// bracket list owned by this object
	_ClearKeywords();			// deletes the keyword hash tables
}

/**
 *	Registers a set of keywords.
 *	A hash table is built from the set (honouring the current case sensitivity) and
 *	stored under the next free cookie; the event listener, if any, is told about
 *	the new TT_KEYWORD token.
 *	@param keywords	the keywords to add
 *	@return			the cookie associated with the new keyword set
 */
TokenCookie CLexer::AddKeywords(const set<string_t>& keywords) {
	AssertValid();

	// The table is owned by m_keywords from here on (released in _ClearKeywords()).
	_CHashTable* const	pNewTable = new _CHashTable(keywords, m_bCaseSensitive);
	m_keywords.insert(make_pair(CLexer::m_nCookie, pNewTable));

	if(m_pEventListener != 0) {
		m_pEventListener->OnLexerAddedIdentifiedToken(TT_KEYWORD, CLexer::m_nCookie);
	}
	_NotifyChange();

	// Hand out the current cookie and advance the counter for the next registration.
	return CLexer::m_nCookie++;
}

/**
 *	Registers a multi-line annotation definition.
 *	@param strStartDelimiter	the delimiter that opens the annotation
 *	@param strEndDelimiter		the delimiter that closes the annotation
 *	@param constraint			constraint on where the start delimiter may appear
 *	@return						the cookie associated with the new definition
 */
TokenCookie CLexer::AddMultilineAnnotation(const string_t& strStartDelimiter,
		const string_t& strEndDelimiter, AnnotationConstraint constraint /* = AC_NONE */) {
	AssertValid();

	const _MultilineAnnotation	newDefinition = {
		strStartDelimiter, strEndDelimiter, constraint
	};
	m_multilineAnnotations.insert(make_pair(CLexer::m_nCookie, newDefinition));

	if(m_pEventListener != 0) {
		m_pEventListener->OnLexerAddedIdentifiedToken(TT_ANNOTATION, CLexer::m_nCookie);
	}
	_NotifyChange();

	// Hand out the current cookie and advance the counter for the next registration.
	return CLexer::m_nCookie++;
}

/**
 *	Registers a single-line annotation that ends at the line break.
 *	@param strStartDelimiter	the delimiter that opens the annotation
 *	@param constraint			constraint on where the start delimiter may appear
 *	@return						the cookie associated with the new definition
 */
TokenCookie CLexer::AddSinglelineAnnotation(const string_t& strStartDelimiter, AnnotationConstraint constraint /* = AC_NONE */) {
	AssertValid();

	const _SinglelineAnnotationEndedByBreak	newDefinition = {
		strStartDelimiter, constraint
	};
	m_singlelineAnnotationBs.insert(make_pair(CLexer::m_nCookie, newDefinition));

	if(m_pEventListener != 0) {
		m_pEventListener->OnLexerAddedIdentifiedToken(TT_ANNOTATION, CLexer::m_nCookie);
	}
	_NotifyChange();

	// Hand out the current cookie and advance the counter for the next registration.
	return CLexer::m_nCookie++;
}

/**
 *	Registers a single-line annotation that ends at an explicit delimiter.
 *	@param strStartDelimiter	the delimiter that opens the annotation
 *	@param strEndDelimiter		the delimiter that closes the annotation
 *	@param constraint			constraint on where the start delimiter may appear
 *	@return						the cookie associated with the new definition
 */
TokenCookie CLexer::AddSinglelineAnnotation(const string_t& strStartDelimiter,
		const string_t& strEndDelimiter, AnnotationConstraint constraint /* = AC_NONE */) {
	AssertValid();

	const _SinglelineAnnotationEndedByDelimiter	newDefinition = {strStartDelimiter, strEndDelimiter, constraint};
	m_singlelineAnnotationDs.insert(make_pair(CLexer::m_nCookie, newDefinition));

	if(m_pEventListener != 0) {
		m_pEventListener->OnLexerAddedIdentifiedToken(TT_ANNOTATION, CLexer::m_nCookie);
	}
	_NotifyChange();

	// Hand out the current cookie and advance the counter for the next registration.
	return CLexer::m_nCookie++;
}

/// L[[hSč폜
void CLexer::_ClearKeywords() {
	for(_KeywordsMap::iterator it = m_keywords.begin(); it != m_keywords.end(); ++it)
		delete it->second;
	m_keywords.clear();
}

/**
 *	L[[h̒Ԃ
 *	@param pwsz		ׂ镶
 *	@param cch		
 *	@param nCookie	[out] L[[hɊ֘AtꂽNbL[l
 *	@return			L[[hłꍇ true
 */
bool CLexer::EatKeyword(const char_t* pwsz, length_t cch, TokenCookie& nCookie) const {
	AssertValid();
	assert(pwsz != 0 && cch != 0);

	for(_KeywordsMap::const_iterator it = m_keywords.begin(); it != m_keywords.end(); ++it) {
		if(it->second->Find(pwsz, cch)) {
			nCookie = it->first;
			return true;
		}
	}
	return false;
}

/**
 *	Tests whether a string starts with a mail address.
 *
 *	The match follows the pattern
 *	"/[\w\d][\w\d\.\-_]*@[\w\d\-_]+(\.[\w\d\-_]+)+/" restricted to ASCII
 *	letters and digits: a local part, '@', and a domain made of at least two
 *	non-empty labels separated by single dots.
 *
 *	@param pwsz	the string to test (must not be null)
 *	@param cch	the length of the string
 *	@return		the length of the mail address at the start of the string, or 0 if there is none
 */
length_t CLexer::EatMailAddress(const char_t* pwsz, length_t cch) {
	// NOTE: these helper macros stay defined after the function, as before.
#define IS_ALNUM(ch)					\
	(((ch) >= L'A' && (ch) <= L'Z')		\
	|| ((ch) >= L'a' && (ch) <= L'z')	\
	|| ((ch) >= L'0' && (ch) <= L'9'))
#define IS_ALNUMBAR(ch)	\
	(IS_ALNUM(ch) || (ch) == L'-' || (ch) == L'_')

	assert(pwsz != 0);

	// The shortest address the pattern admits is "a@b.c".
	if(cch < 5)
		return 0;

	// First character of the local part.
	if(!IS_ALNUM(pwsz[0]))	return 0;

	// Rest of the local part, up to the '@'. At least three characters ("b.c")
	// must remain after the '@', hence the "cch - 3" bound.
	length_t	i = 1;
	for(; i < cch - 3; ++i) {
		if(!IS_ALNUMBAR(pwsz[i]) && pwsz[i] != L'.')
			break;
	}
	if(pwsz[i] != L'@' || cch - i == 3)
		return 0;

	// Domain part after the '@'.
	const length_t	iAt = i;
	bool			bDotAppeared = false;
	for(i = iAt + 1; i < cch; ++i) {
		if(IS_ALNUMBAR(pwsz[i]))
			continue;
		else if(pwsz[i] == L'.') {
			// A dot directly after '@' would leave the first label empty.
			if(i == iAt + 1)
				return 0;
			// A dot only belongs to the address when a label character follows it.
			// Otherwise (end of input, "..", or punctuation such as a sentence-ending
			// period) the address stops in front of this dot.
			if(i + 1 >= cch || !IS_ALNUMBAR(pwsz[i + 1]))
				break;
			bDotAppeared = true;
		} else
			break;
	}
	return (bDotAppeared && (i - iAt > 2)) ? i : 0;
}

/**
 *	Returns the length of a multi-line annotation (start and end delimiters included).
 *	@param pwsz			the string to test (must not be null)
 *	@param cch			the length of the string
 *	@param constraint	the constraints satisfied by the position being tested
 *	@param nCookie		[in, out] On input: the cookie of the multi-line annotation that
 *						is still open from the preceding text, or NullCookie if none is.
 *						On output: the cookie of the annotation that was recognised
 *						(left as NullCookie when the text is not a multi-line annotation)
 *	@param bContinued	[out] true if the annotation does not end within the given text
 *	@return				the length of the annotation, or 0 if the text is not one
 *	@exception invalid_argument	<var>nCookie</var> does not identify a registered annotation
 */
length_t CLexer::EatMultilineAnnotation(const char_t* pwsz, length_t cch,
		AnnotationConstraint constraint, TokenCookie& nCookie, bool& bContinued) const throw(invalid_argument) {
	AssertValid();
	assert(pwsz != 0);

	length_t	cchOpening = 0;	// characters consumed by a start delimiter found here

	bContinued = false;
	if(nCookie == NullCookie) {
		// No annotation is open yet: look for a start delimiter at the very beginning.
		_MAnnotationMap::const_iterator	itCandidate = m_multilineAnnotations.begin();
		for(; itCandidate != m_multilineAnnotations.end(); ++itCandidate) {
			// The definition's constraints must all be met, the delimiter must fit
			// into the text, and the text must begin with it.
			if((itCandidate->second.constraint & constraint) == itCandidate->second.constraint
					&& itCandidate->second.strStartDelimiter.length() <= cch
					&& wcsncmp(pwsz,
						itCandidate->second.strStartDelimiter.data(),
						itCandidate->second.strStartDelimiter.length()) == 0) {
				nCookie = itCandidate->first;
				cchOpening = itCandidate->second.strStartDelimiter.length();
				break;
			}
		}
		if(nCookie == NullCookie)
			return 0;
	}
	bContinued = true;

	// Look up the definition whose end delimiter has to be found.
	const _MAnnotationMap::const_iterator	itOpen = m_multilineAnnotations.find(nCookie);
	if(itOpen == m_multilineAnnotations.end())
		throw invalid_argument("Input cookie value is invalid.");

	// Search the raw buffer directly; an earlier variant copied the text into a
	// temporary string_t and used find(), which was slower.
	const char_t* const	pTextEnd = pwsz + cch;
	const char_t* const	pClosing = search(pwsz + cchOpening, pTextEnd,
		itOpen->second.strEndDelimiter.begin(), itOpen->second.strEndDelimiter.end());
	if(pClosing != pTextEnd) {
		bContinued = false;
		return pClosing - pwsz + itOpen->second.strEndDelimiter.length();
	}
	// No end delimiter: the annotation covers the whole text and continues beyond it.
	return cch;
}

// Number scanner for NF_NUMERAL_FOLLOWED_BY_ALPHANUMERAL: a digit followed by any
// run of '.' and identifier-continue characters. Returns the length of that run,
// or 0 when the text does not start with a digit. The caller guarantees cch != 0.
template<> length_t inline CLexer::_EatNumbers<NF_NUMERAL_FOLLOWED_BY_ALPHANUMERAL>(const char_t* pwsz, length_t cch) const {
	if(!toBoolean(iswdigit(pwsz[0])))
		return 0;

	length_t	iPos = 1;
	while(iPos < cch) {
		// Combine a surrogate pair into one code point when both halves are present.
		CodePoint	cpCurrent;
		if(iPos < cch - 1
				&& IsUtf16HighSurrogate(pwsz[iPos])
				&& IsUtf16LowSurrogate(pwsz[iPos + 1]))
			cpCurrent = DecodeUtf16SurrogatesToCodePoint(pwsz + iPos, cch - iPos);
		else
			cpCurrent = pwsz[iPos];

		// Stop at the first character that can not be part of the number.
		if(pwsz[iPos] != L'.' && !IsIdentifierContinueCodePoint(cpCurrent))
			return iPos;

		// A supplementary code point occupies two UTF-16 units.
		iPos += (cpCurrent > 0xFFFF) ? 2 : 1;
	}
	return cch;
}

/**
 *	Returns the length of the number at the start of a string, using the
 *	currently selected number format.
 *	@param pwsz	the string to test (must not be null)
 *	@param cch	the length of the string
 *	@return		the length of the number, or 0 if the string does not start with one
 */
length_t CLexer::EatNumbers(const char_t* pwsz, length_t cch) const {
	AssertValid();
	assert(pwsz != 0);

	if(cch != 0) {
		if(m_numberFormat == NF_NUMERAL_FOLLOWED_BY_ALPHANUMERAL)
			return _EatNumbers<NF_NUMERAL_FOLLOWED_BY_ALPHANUMERAL>(pwsz, cch);
		// Formats without a scanner so far: NF_CPLUSPLUS, NF_PERL, NF_RUBY,
		// NF_VBSCRIPT, NF_JAVASCRIPT_15, NF_JAVASCRIPT_20.
		assert(false);
	}
	return 0;
}

/**
 *	Returns the length of the operator at the start of a string.
 *	Operators are looked up by their first character; the first registered operator
 *	(in set order) that the text begins with is reported.
 *	@param pwsz	the string to test (must not be null)
 *	@param cch	the length of the string
 *	@return		the length of the operator, or 0 if the string does not start with one
 */
length_t CLexer::EatOperators(const char_t* pwsz, length_t cch) const {
	AssertValid();
	assert(pwsz != 0);

	if(cch == 0)
		return 0;

	// Operators are bucketed by their first character.
	const _OperatorMap::const_iterator	itBucket = m_operators.find(pwsz[0]);
	if(itBucket == m_operators.end())
		return 0;

	const _OperatorSet&	candidates = itBucket->second;
	_OperatorSet::const_iterator	itOperator = candidates.begin();
	for(; itOperator != candidates.end(); ++itOperator) {
		// The operator must fit into the text and the text must begin with it.
		if(itOperator->length() <= cch
				&& wcsncmp(pwsz, itOperator->data(), itOperator->length()) == 0)
			return itOperator->length();
	}
	return 0;
}

/**
 *	Returns the length of a single-line annotation at the start of a string.
 *	Annotations that end at the line break are tried first, then the ones that end
 *	at an explicit delimiter.
 *	@param pwsz			the string to test
 *	@param cch			the length of the string
 *	@param constraint	the constraints satisfied by the position being tested
 *	@param nCookie		[out] the cookie of the annotation that matched; untouched when nothing matches
 *	@return				the length of the annotation, or 0 if the string does not start with one
 */
length_t CLexer::EatSinglelineAnnotation(const char_t* pwsz,
		length_t cch, AnnotationConstraint constraint, TokenCookie& nCookie) const {
	AssertValid();

	// Annotations terminated by the line break: a match swallows the rest of the text.
	_SAnnotationBMap::const_iterator	itBreak = m_singlelineAnnotationBs.begin();
	for(; itBreak != m_singlelineAnnotationBs.end(); ++itBreak) {
		if((itBreak->second.constraint & constraint) == itBreak->second.constraint
				&& itBreak->second.strStartDelimiter.length() <= cch
				&& wcsncmp(pwsz, itBreak->second.strStartDelimiter.data(),
					itBreak->second.strStartDelimiter.length()) == 0) {
			nCookie = itBreak->first;
			return cch;
		}
	}

	// Annotations terminated by a delimiter: a match runs through the end delimiter,
	// or to the end of the text when the end delimiter is missing.
	_SAnnotationDMap::const_iterator	itDelim = m_singlelineAnnotationDs.begin();
	for(; itDelim != m_singlelineAnnotationDs.end(); ++itDelim) {
		if((itDelim->second.constraint & constraint) != itDelim->second.constraint
				|| itDelim->second.strStartDelimiter.length() > cch
				|| wcsncmp(pwsz,
					itDelim->second.strStartDelimiter.data(),
					itDelim->second.strStartDelimiter.length()) != 0)
			continue;

		nCookie = itDelim->first;
		const char_t* const	pTextEnd = pwsz + cch;
		const char_t* const	pClosing = search(pwsz + itDelim->second.strStartDelimiter.length(), pTextEnd,
									itDelim->second.strEndDelimiter.begin(), itDelim->second.strEndDelimiter.end());
		if(pClosing < pTextEnd)
			return pClosing - pwsz + itDelim->second.strEndDelimiter.length();
		return cch;
	}
	return 0;
}

/**
 *	@brief	Tests whether a string starts with a URL.
 *
 *	Currently the following prefixes are regarded as the start of a URL:
 *	<ul>
 *		<li>file://</li><li>ftp://</li><li>gopher://</li><li>http://</li><li>https://</li>
 *		<li>mailto://</li><li>news://</li><li>nntp://</li><li>telnet://</li><li>wais://</li>
 *	</ul>
 *	The URL extends over every following character that is marked in the URI
 *	character table below; it stops at the first other character.
 *
 *	@param pwsz	the string to test (must not be null)
 *	@param cch	the length of the string
 *	@return		the length of the URL at the start of the string, or 0 if there is none
 */
length_t CLexer::EatUrlString(const char_t* pwsz, length_t cch) {
	assert(pwsz != 0);

	// Sets cchUrl to the length of the scheme remainder and tests whether the text is
	// <start_ch><remain>. "cch > cchUrl" keeps the comparison inside the buffer.
#define STARTS_WITH(remain_len, start_ch, remain)	\
	((cchUrl = remain_len) && pwsz[0] == start_ch && cch > cchUrl && wcsncmp(pwsz + 1, remain, cchUrl) == 0)

	// Characters that may appear in a URI, indexed by ASCII code (128 entries).
	// (This comment must not end with a backslash: that would splice the first
	// table row into the comment.)
	static const bool	urlChars[] = {
		false,	false,	false,	false,	false,	false,	false,	false,	// 0x00
		false,	false,	false,	false,	false,	false,	false,	false,
		false,	false,	false,	false,	false,	false,	false,	false,	// 0x10
		false,	false,	false,	false,	false,	false,	false,	false,
		false,	true,	false,	true,	true,	true,	true,	false,	// 0x20
		false,	false,	false,	true,	true,	true,	true,	true,
		true,	true,	true,	true,	true,	true,	true,	true,	// 0x30
		true,	true,	true,	true,	false,	true,	false,	true,
		true,	true,	true,	true,	true,	true,	true,	true,	// 0x40
		true,	true,	true,	true,	true,	true,	true,	true,
		true,	true,	true,	true,	true,	true,	true,	true,	// 0x50
		true,	true,	true,	false,	true,	false,	false,	true,
		false,	true,	true,	true,	true,	true,	true,	true,	// 0x60
		true,	true,	true,	true,	true,	true,	true,	true,
		true,	true,	true,	true,	true,	true,	true,	true,	// 0x70
		true,	true,	true,	false,	false,	false,	true,	false
	};
	length_t	cchUrl;

	// Check the length before reading pwsz[0] (the shortest prefix, "ftp://", has six
	// characters), and reject non-ASCII characters before they are used as an index:
	// the table has only 128 entries, so a truncated wide character such as U+00E9
	// would otherwise read past its end.
	if(cch < 6 || pwsz[0] > 0x007F || !urlChars[static_cast<uchar>(pwsz[0])])
		return 0;
	if(STARTS_WITH(6, L'f', L"ile://")
			|| STARTS_WITH(5, L'f', L"tp://")
			|| STARTS_WITH(8, L'g', L"opher://")
			|| STARTS_WITH(6, L'h', L"ttp://")
			|| STARTS_WITH(7, L'h', L"ttps://")
			|| STARTS_WITH(8, L'm', L"ailto://")
			|| STARTS_WITH(6, L'n', L"ews://")
			|| STARTS_WITH(6, L'n', L"ntp://")
			|| STARTS_WITH(8, L't', L"elnet://")
			|| STARTS_WITH(6, L'w', L"ais://")) {
		// cchUrl + 1 is the length of the whole prefix; extend the URL from there.
		for(++cchUrl; cchUrl < cch; ++cchUrl) {
			if(pwsz[cchUrl] > 0x007F || !urlChars[static_cast<uchar>(pwsz[cchUrl])])
				return cchUrl;
		}
		return cch;
	}
	return 0;

#undef STARTS_WITH
}

/**
 *	Returns the traits of the given bracket.
 *	The bracket list stores openers at even positions, each followed by its closer.
 *	@param chBracket	the bracket to look up
 *	@param chPair		[out] the bracket paired with <var>chBracket</var>
 *	@param bOpener		[out] true if <var>chBracket</var> is an opening bracket
 *	@return				true if <var>chBracket</var> is one of the brackets in use;
 *						the output parameters are untouched otherwise
 */
bool CLexer::GetBracketTraits(char_t chBracket, char_t& chPair, bool& bOpener) const {
	// wcschr() treats the terminating NUL as part of the string, so searching for
	// L'\0' would "find" the terminator and the pair lookup below would then read
	// next to (possibly past) the end of the list. A NUL is never a bracket.
	if(chBracket == 0 || m_pwszBrackets == 0)
		return false;

	const char_t* const	pwszFound = wcschr(m_pwszBrackets, chBracket);
	if(pwszFound == 0)
		return false;

	// Even offset: opener, paired with the next character.
	// Odd offset: closer, paired with the previous character.
	bOpener = ((pwszFound - m_pwszBrackets) % 2 == 0);
	chPair = bOpener ? pwszFound[1] : pwszFound[-1];
	return true;
}

/**
 *	L[[h̑啶ʂ邩ǂ̐ݒB
 *	ݒύXƓo^ĂL[[h͑Sč폜
 *	@param bIgnore	ʂȂꍇ true
 */
void CLexer::IgnoreCase(bool bIgnore) {
	AssertValid();
	if(bIgnore == m_bCaseSensitive) {
		m_bCaseSensitive = !bIgnore;
		m_keywords.clear();
		_NotifyChange();
	}
}

/**
 *	Tests whether a code point may appear inside an identifier.
 *	Every identifier-start character qualifies. With Unicode alphabets enabled the
 *	ID_Continue additions from the Unicode property list qualify as well; otherwise
 *	only '_' and the ASCII digits do.
 *	@param cp	the code point to test
 *	@return		true if the code point may continue an identifier
 *	@see		CLexer::IsIdentifierStartCodePoint
 */
bool CLexer::IsIdentifierContinueCodePoint(CodePoint cp) const {
	AssertValid();

	// Anything that may start an identifier may also continue one.
	if(IsIdentifierStartCodePoint(cp))
		return true;

	if(!m_bEnableUnicodeAlphabets)
		return (cp == L'_') || (cp >= L'0' && cp <= L'9');
	return binary_search(IDContinueCodePoints, _endof(IDContinueCodePoints), cp);
}

/**
 *	Tests whether a code point may start an identifier.
 *	User-registered additional alphabets always qualify. With Unicode alphabets
 *	enabled the test is based on the Unicode ID_Start property list plus a few
 *	large contiguous ranges; otherwise only the ASCII letters qualify.
 *	@param cp	the code point to test
 *	@return		true if the code point may start an identifier
 *	@see		CLexer::IsIdentifierContinueCodePoint
 */
bool CLexer::IsIdentifierStartCodePoint(CodePoint cp) const {
	AssertValid();

#if ASCENSION_UNICODE_VERSION != 0x0410
#error This code is based on old version of Unicode.
#endif
	// Large contiguous blocks that are tested by range instead of through the table.
	static const CodePoint	contiguousRanges[][2] = {
		{0x1401, 0x166C},	// Canadian Syllabics
		{0x3400, 0x4DB5},	// CJK Unified Ideograph
		{0x4E00, 0x9FBB},	// CJK Unified Ideograph
		{0xA016, 0xA48C},	// Yi Syllable
		{0xAC00, 0xD7A3},	// Hangul Syllable
		{0x20000, 0x2A6D6},	// CJK Unified Ideograph
		{0x2F800, 0x2FA1D}	// CJK Compatibility Ideograph
	};

	// Alphabets added by the user take precedence over everything else.
	if(binary_search(m_additionalAlphabets.begin(), m_additionalAlphabets.end(), cp))
		return true;

	// ASCII-only mode.
	if(!m_bEnableUnicodeAlphabets)
		return (cp >= 'A' && cp <= 'Z') || (cp >= 'a' && cp <= 'z');

	if(binary_search(IDStartCodePoints, _endof(IDStartCodePoints), cp))
		return true;
	for(size_t iRange = 0; iRange < _countof(contiguousRanges); ++iRange) {
		if(cp >= contiguousRanges[iRange][0] && cp <= contiguousRanges[iRange][1])
			return true;
	}
	return false;
}

/**
 *	Splits a string into tokens.
 *	@param str		the string to analyze
 *	@param nCookie	[in, out] on entry, the cookie of the multi-line annotation that
 *					continues into <var>str</var> from the preceding text, or NullCookie
 *					if there is none. It is reset to NullCookie when that annotation ends
 *					inside <var>str</var>, and set to the cookie of any multi-line
 *					annotation found in <var>str</var> that is reported as continuing
 *	@param tokens	[out] receives the tokens (appended to the list)
 */
void CLexer::Parse(const string_t& str, TokenCookie& nCookie, list<CToken>& tokens) const {
	AssertValid();

	CToken			token;
	const char_t*	pwsz = str.c_str();
	const length_t	cch = str.length();
	bool			bMCommentContinued;

	token.m_i = 0;
	if(nCookie != NullCookie) {	// a multi-line annotation continues from before this string
		token._SetCookie(nCookie);
		const length_t	cchToken = EatMultilineAnnotation(pwsz, cch,
			AC_ONLYSTARTOFLINE | AC_ONLYHEADOFLINE, nCookie, bMCommentContinued);
		token._SetType(TT_ANNOTATION);
		tokens.push_back(token);
		token.m_i = cchToken;	// scanning resumes right after the annotation
		if(!bMCommentContinued)
			nCookie = NullCookie;
	}
	token._SetCookie(NullCookie);

	length_t				i = token.m_i;
	length_t				cchToken;
	TokenCookie				nTokenCookie = NullCookie;
	AnnotationConstraint	annotationConstraint;
	bool					bAppearedUnspace = false;	// set once a token other than white space or tab was seen
	while(i < cch) {
		// Work out which annotation constraints apply at this position
		annotationConstraint = AC_NONE;
		if(i == 0)				annotationConstraint |= AC_ONLYSTARTOFLINE;
		if(!bAppearedUnspace)	annotationConstraint |= AC_ONLYHEADOFLINE;

		if(m_enabledTokenTypes[TT_TAB] && pwsz[i] == L'\t') {	// tab
			cchToken = 1;
			token._SetType(TT_TAB);
		} else if(m_enabledTokenTypes[TT_WHITESPACE]	// white space
				&& 0 != (cchToken = EatWhiteSpaces(pwsz + i, cch - i, false)))
			token._SetType(TT_WHITESPACE);
		else if(m_enabledTokenTypes[TT_NUMBER]	// number
				&& 0 != (cchToken = EatNumbers(pwsz + i, cch - i)))
			token._SetType(TT_NUMBER);
		else if(m_enabledTokenTypes[TT_SINGLEQUOTATION]	// single-quoted text
				&& pwsz[i] == L'\''
				&& 0 != (cchToken = EatQuotation(pwsz + i, cch - i, m_bEscapeByBackSolidus)))
			token._SetType(TT_SINGLEQUOTATION);
		else if(m_enabledTokenTypes[TT_DOUBLEQUOTATION]	// double-quoted text
				&& pwsz[i] == L'\"'
				&& 0 != (cchToken = EatQuotation(pwsz + i, cch - i, m_bEscapeByBackSolidus)))
			token._SetType(TT_DOUBLEQUOTATION);
		else if(m_enabledTokenTypes[TT_OTHERQUOTATION]	// text quoted by other marks
				&& pwsz[i] != L'\'' && pwsz[i] != L'\"'
				&& 0 != (cchToken = EatQuotation(pwsz + i, cch - i, m_bEscapeByBackSolidus)))
			token._SetType(TT_OTHERQUOTATION);
		else if(m_enabledTokenTypes[TT_ANNOTATION]	// single-line annotation
				&& 0 != (cchToken = EatSinglelineAnnotation(pwsz + i, cch - i, annotationConstraint, nTokenCookie))) {
			token._SetCookie(nTokenCookie);
			token._SetType(TT_ANNOTATION);
		} else if(m_enabledTokenTypes[TT_ANNOTATION]	// multi-line annotation
				&& 0 != (cchToken = EatMultilineAnnotation(pwsz + i,
				cch - i, annotationConstraint, nTokenCookie, bMCommentContinued))) {
			token._SetCookie(nTokenCookie);
			token._SetType(TT_ANNOTATION);
			if(bMCommentContinued)	// the annotation is reported as continuing: hand its cookie back
				nCookie = nTokenCookie;
		} else if(m_enabledTokenTypes[TT_OPERATOR]	// operator
				&& 0 != (cchToken = EatOperators(pwsz + i, cch - i)))
			token._SetType(TT_OPERATOR);
		else if(m_enabledTokenTypes[TT_NUMERAL]	// numeral
				&& 0 != (cchToken = EatNumerals(pwsz + i, cch - i)))
			token._SetType(TT_NUMERAL);
		else if(m_enabledTokenTypes[TT_ASCII_CONTROL]	// ASCII control character
				&& 0 != (cchToken = EatAsciiControls(pwsz + i, cch - i)))
			token._SetType(TT_ASCII_CONTROL);
		else if(m_enabledTokenTypes[TT_UNICODE_CONTROL]	// Unicode control character
				&& EatUnicodeControls(pwsz + i, cch - i)) {
			cchToken = 1;
			token._SetType(TT_UNICODE_CONTROL);
		} else if(m_enabledTokenTypes[TT_IDENTIFIER]	// identifier or keyword
				&& 0 != (cchToken = EatIdentifier(pwsz + i, cch - i))) {
			if(m_keywords.empty())
				token._SetType(TT_IDENTIFIER);
			else if(EatKeyword(pwsz + i, cchToken, nTokenCookie)) {
				token._SetCookie(nTokenCookie);
				token._SetType(TT_KEYWORD);
			} else
				token._SetType(TT_IDENTIFIER);
		} else {	// anything else
			// Consume code points up to and including the first grapheme base.
			// Each code point has to be decoded at the running offset: decoding
			// at the token start on every pass would test the same code point
			// again and again and, if it is not a grapheme base, swallow the
			// whole remainder of the string.
			CodePoint	cp;
			cchToken = 0;
			while(i + cchToken < cch) {
				cp = DecodeUtf16SurrogatesToCodePoint(pwsz + i + cchToken, cch - i - cchToken);
				cchToken += (cp > 0xFFFF) ? 2 : 1;
				if(CBoundarySearcher::IsGraphemeBase(cp))
					break;
			}
			token._SetType(TT_UNSPECIFIED);
		}

		// Consecutive unspecified tokens are merged into the previous one to keep
		// the list short; every other token is appended
		if(tokens.empty() || token.GetType() != TT_UNSPECIFIED || token.GetType() != tokens.back().GetType()
				|| (!tokens.empty() && i - tokens.back().GetIndex() == 1	// a bracket in use always forms a token of its own
				&& (wcschr(m_pwszBrackets, pwsz[i - 1]) != 0 || wcschr(m_pwszBrackets, pwsz[i]) != 0))) {
			token.m_i = i;
			tokens.push_back(token);
			nTokenCookie = NullCookie;
			token._SetCookie(NullCookie);
		}
		i += cchToken;

		if(token.GetType() != TT_WHITESPACE && token.GetType() != TT_TAB)
			bAppearedUnspace = true;
	}
}

/**
 *	Determines the multi-line annotation state at the end of a string.
 *	@param str		the string to analyze
 *	@param nCookie	the cookie of the multi-line annotation that continues into
 *					<var>str</var> from the preceding text, or NullCookie if there is none
 *	@return			the cookie of the multi-line annotation that is reported as
 *					continuing past <var>str</var>, or NullCookie otherwise.
 *					An empty string returns <var>nCookie</var> unchanged
 */
TokenCookie CLexer::ParseMultilineAnnotation(const string_t& str, TokenCookie nCookie) const {
	AssertValid();

	if(str.empty())
		return nCookie;

	const char_t*	pwsz = str.data();
	const length_t	cch = str.length();
	length_t		i = 0;
	length_t		cchToken;
	TokenCookie		nDummy;
	bool			bMCommentContinued;

	// A multi-line annotation continues from the preceding text
	// NOTE(review): Parse passes AC_* flags as the third argument of the Eat*Annotation
	// calls while this method passes a boolean -- confirm the two are meant to agree.
	if(nCookie != NullCookie) {
		i = EatMultilineAnnotation(pwsz, cch, true, nCookie, bMCommentContinued);
		if(bMCommentContinued)
			return nCookie;
		nCookie = NullCookie;
	}

	while(i < cch) {
		if(m_enabledTokenTypes[TT_ANNOTATION]
				&& 0 != (cchToken = EatSinglelineAnnotation(pwsz + i, cch - i, i == 0, nDummy))) {
			return NullCookie;
		} else if(m_enabledTokenTypes[TT_ANNOTATION]
				&& 0 != (cchToken = EatMultilineAnnotation(pwsz + i,
					cch - i, i == 0, nCookie, bMCommentContinued))) {
			if(bMCommentContinued)
				return nCookie;
			nCookie = NullCookie;
			i += cchToken;
		}
		// The quotation branches test the quote character exactly as Parse does.
		// Without these tests a disabled quotation type would still be skipped here
		// whenever another quotation type is enabled, so this method and Parse could
		// disagree about whether an annotation opener lies inside a quotation.
		else if(m_enabledTokenTypes[TT_SINGLEQUOTATION]
				&& pwsz[i] == L'\''
				&& 0 != (cchToken = EatQuotation(pwsz + i, cch - i, m_bEscapeByBackSolidus)))
			i += cchToken;
		else if(m_enabledTokenTypes[TT_DOUBLEQUOTATION]
				&& pwsz[i] == L'\"'
				&& 0 != (cchToken = EatQuotation(pwsz + i, cch - i, m_bEscapeByBackSolidus)))
			i += cchToken;
		else if(m_enabledTokenTypes[TT_OTHERQUOTATION]
				&& pwsz[i] != L'\'' && pwsz[i] != L'\"'
				&& 0 != (cchToken = EatQuotation(pwsz + i, cch - i, m_bEscapeByBackSolidus)))
			i += cchToken;
		else
			++i;
	}
	return NullCookie;
}

///	Removes every registered keyword, annotation and operator, then signals the change
void CLexer::RemoveAll() {
	AssertValid();

	_ClearKeywords();

	// The remaining registries are plain containers and are simply emptied
	m_operators.clear();
	m_singlelineAnnotationDs.clear();
	m_singlelineAnnotationBs.clear();
	m_multilineAnnotations.clear();

	_NotifyChange();
}

/**
 *	Removes the keyword or annotation registered under the given cookie.
 *	The registries are searched in this order: keywords, multi-line annotations,
 *	then the two single-line annotation maps; the first match is removed.
 *	The event listener, if any, is told which kind of token was removed.
 *	@param nCookie					the cookie identifying the token to remove
 *	@throw std::invalid_argument	no token is registered under <var>nCookie</var>
 */
void CLexer::RemoveIdentifiedToken(TokenCookie nCookie) throw(invalid_argument) {
	AssertValid();

	// TT_COUNT doubles as the "nothing removed yet" marker
	TokenType	removedType = TT_COUNT;

	const _KeywordsMap::iterator	keyword = m_keywords.find(nCookie);
	if(keyword != m_keywords.end()) {
		delete keyword->second;	// the map owns the mapped object
		m_keywords.erase(keyword);
		removedType = TT_KEYWORD;
	}

	if(removedType == TT_COUNT) {
		const _MAnnotationMap::iterator	multiline = m_multilineAnnotations.find(nCookie);
		if(multiline != m_multilineAnnotations.end()) {
			m_multilineAnnotations.erase(multiline);
			removedType = TT_ANNOTATION;
		}
	}

	if(removedType == TT_COUNT) {
		const _SAnnotationBMap::iterator	singlelineB = m_singlelineAnnotationBs.find(nCookie);
		if(singlelineB != m_singlelineAnnotationBs.end()) {
			m_singlelineAnnotationBs.erase(singlelineB);
			removedType = TT_ANNOTATION;
		}
	}

	if(removedType == TT_COUNT) {
		const _SAnnotationDMap::iterator	singlelineD = m_singlelineAnnotationDs.find(nCookie);
		if(singlelineD != m_singlelineAnnotationDs.end()) {
			m_singlelineAnnotationDs.erase(singlelineD);
			removedType = TT_ANNOTATION;
		}
	}

	if(removedType == TT_COUNT)
		throw invalid_argument("Specified cookie value is invalid.");

	if(m_pEventListener != 0)
		m_pEventListener->OnLexerRemovedIdentifiedToken(removedType, nCookie);
	_NotifyChange();
}

///	Puts every setting back to its initial value and removes all registered tokens
void CLexer::Reset() {
	AssertValid();

	// Every token type is enabled
	fill(m_enabledTokenTypes, _endof(m_enabledTokenTypes), true);

	// Scalar options
	m_numberFormat = NF_NUMERAL_FOLLOWED_BY_ALPHANUMERAL;
	m_bEnableUnicodeWhiteSpaces = true;
	m_bEnableUnicodeAlphabets = true;
	m_bEscapeByBackSolidus = true;
	m_bCaseSensitive = true;

	// Done last: RemoveAll also signals the change
	RemoveAll();
}

/**
 *	̃At@xbgȊOɁAP\ƂĂ݂ȂR[h|Cgݒ肷
 *	@param pwszAlphabets	ׂBLȃTQ[gyA͔ BMP ɕϊ
 *	@param cch				<var>pwszAlphabets</var> ̕
 */
void CLexer::SetAdditionalAlphabets(const char_t* pwszAlphabets, length_t cch) {
	AssertValid();

	m_additionalAlphabets.clear();
	for(length_t i = 0; i < cch; ++i) {
		const CodePoint	cp = DecodeUtf16SurrogatesToCodePoint(pwszAlphabets + i, cch - i);
		m_additionalAlphabets.insert(cp);
		if(cp > 0xFFFF)
			++i;
	}
	_NotifyChange();
}

/**
 *	Replaces the set of extra code points that are accepted as identifier-start
 *	characters in addition to the regular alphabets.
 *	@param alphabets	the code points to register; the set is copied
 */
void CLexer::SetAdditionalAlphabets(const set<CodePoint>& alphabets) {
	AssertValid();

	m_additionalAlphabets = alphabets;

	_NotifyChange();
}

/**
 *	ʂƂĎgp镶ݒ肷
 *	@param pwszBrackets				Jʂׂ
 *	@throw std::invalid_argument	JʂƂĎgpłȂɊ܂܂ĂƂX[
 */
void CLexer::SetBrackets(const char_t* pwszBrackets) {
	AssertValid();
	assert(pwszBrackets != 0);

	ostringstream_t	ss;

	for(size_t i = 0; pwszBrackets[i] != 0; ++i) {
		for(size_t j = 0; ; ++j) {
			if(bracketPairs[j].first == 0 || bracketPairs[j].second == pwszBrackets[i])
				throw invalid_argument("Specified character can not be used as an opener.");
			else if(bracketPairs[j].first == pwszBrackets[i]) {
				ss << bracketPairs[j].first << bracketPairs[j].second;
				break;
			}
		}
	}
	const string_t	str = ss.str();
	delete[] m_pwszBrackets;
	m_pwszBrackets = new char_t[str.length() + 1];
	wcscpy(m_pwszBrackets, str.c_str());
}

/**
 *	Zqݒ肷
 *	@param operators	Zq̏W
 */
void CLexer::SetOperators(const set<string_t>& operators) {
	AssertValid();

	m_operators.clear();
	for(set<string_t>::const_iterator it = operators.begin(); it != operators.end(); ++it) {
		if(it->empty())
			continue;
		m_operators[it->at(0)].insert(*it);
	}
	_NotifyChange();
}

/* [EOF] */