#!/usr/local/bin/perl

# props.pl (c) 2005-2006 exeal
#
# This script generates C++ code (3 files) for the CharProperty class implementation.
# The generated files are:
# - UnicodeProperties_Table
# - UnicodeProperties_Definition
# - UnicodeProperties_Implementation

use strict;
use integer;

# General_Category abbreviations listed in the order of the numeric values
# that are written into the generated category table (Lu is 0, Cn is 29).
my @categoryNames = qw(
	Lu Ll Lt Lm Lo
	Mn Mc Me
	Nd Nl No
	Pc Pd Ps Pe Pi Pf Po
	Sm Sc Sk So
	Zs Zl Zp
	Cc Cf Cs Co Cn
);

# abbreviation => numeric value (the position in the list above)
my %categoryMap;
@categoryMap{@categoryNames} = (0 .. $#categoryNames);

# process input files' directory

# Returns the given directory in the form the rest of the script prepends to
# input file names: every '/' is turned into '\' and exactly one trailing '\'
# is ensured. An undefined or empty directory yields '' (current directory).
sub normalizeInputDirectory {
	my ($path) = @_;
	return '' unless(defined $path and $path ne '');
	$path =~ s/\//\\/g;	# /g: convert every separator, not only the first one
	$path .= '\\' unless($path =~ /\\$/);
	return $path;
}

die "usage: props.pl [input-file-directory]\n" if($#ARGV > 0);
my $directory = normalizeInputDirectory(shift @ARGV);


# generate internal structural code
#
# Writes to OUTPUT_DEF the C++ helpers used by the generated lookup functions:
# - CodeRange<Code>: an inclusive [first, last] range
# - PropertyRange: an inclusive [first, last] range with a property value
# - findInRange(): binary search over an array of such ranges sorted by 'first';
#   returns the element containing cp, or 0 if there is none
sub defineInternalStructures() {
# std::lower_bound yields the first element whose 'first' is not less than cp.
# When that element does not start exactly at cp, the only candidate is the
# preceding element -- and this is also true when lower_bound returns 'last',
# i.e. when cp lies inside (or beyond) the final range of the table.
print OUTPUT_DEF <<'END_OF_DEFINITION';
template<class Code> struct CodeRange {
	Code first, last;
	bool operator <(Code rhs) const {return first < rhs;}
};
struct PropertyRange {
	CodePoint first, last;
	uchar property;
	bool operator <(CodePoint rhs) const {return first < rhs;}
};
template<class Element> static const Element* findInRange(const Element* first, const Element* last, CodePoint cp) {
	const Element* p = std::lower_bound(first, last, cp);
	if(p != last && p->first == cp) return p;
	else if(p != first && p[-1].last >= cp) return p - 1;
	else return 0;
}
END_OF_DEFINITION
}

# generate general categories code
#
# Reads '${directory}UnicodeData.txt' and merges consecutive code points that
# share a general category into {first, last, category} entries, where the
# category number comes from %categoryMap. Writes:
# - OUTPUT_DEF:   declarations of categories_ and categoryCount_
# - OUTPUT_TABLE: the categories_ array and categoryCount_
# - OUTPUT_IMPL:  CharProperty::getGeneralCategory()
# Dies if the input cannot be opened, if a '<..., First>' record is not
# followed by its '<..., Last>' record, or if a category is not in %categoryMap.
sub processGeneralCategories() {
	open(my $input, '<', $directory . 'UnicodeData.txt')
		or die "Input file '${directory}UnicodeData.txt' not found.\n";
	print 'generating general categories table...' . "\n";

	print OUTPUT_DEF 'static const PropertyRange categories_[];' . "\n";
	print OUTPUT_DEF 'static const std::size_t categoryCount_;' . "\n";
	print OUTPUT_TABLE 'const CharProperty::PropertyRange CharProperty::categories_[] = {' . "\n";

	my @entries;	# formatted '{first,last,category}' initializers
	my ($first, $last, $continuedGc);	# the run being accumulated; undefined before the first record

	while(my $line = <$input>) {
		next unless($line =~ /^([\dA-Fa-f]+)\;(.+?)\;(\w\w)/);
		my ($cp, $name, $gc) = (hex $1, $2, $3);
		die "Unknown general category '$gc' in UnicodeData.txt.\n" unless(exists $categoryMap{$gc});

		# a '<..., First>' record describes a whole range whose end is on the next line
		my $end = $cp;
		if($name =~ /^\<.+?First\>$/) {
			my $nextLine = <$input>;
			die sprintf("Range starting at %04X has no end in UnicodeData.txt.\n", $cp)
				unless(defined $nextLine and $nextLine =~ /^([\dA-Fa-f]+)\;/);
			$end = hex $1;
		}

		if(defined $first and $cp == $last + 1 and $gc eq $continuedGc) {	# continued
			$last = $end;
		} else {	# not continued
			push @entries, sprintf('{0x%X,0x%X,%d}', $first, $last, $categoryMap{$continuedGc})
				if(defined $first);
			($first, $last, $continuedGc) = ($cp, $end, $gc);
		}
	}
	close $input;
	push @entries, sprintf('{0x%X,0x%X,%d}', $first, $last, $categoryMap{$continuedGc})
		if(defined $first);

	print OUTPUT_TABLE join(',', @entries) . "};\n";
	print OUTPUT_TABLE 'const size_t CharProperty::categoryCount_ = countof(CharProperty::categories_);' . "\n";
	print OUTPUT_IMPL <<'END_OF_GET_CATEGORY';
inline CharProperty::GeneralCategory CharProperty::getGeneralCategory(CodePoint cp) {
	if(const PropertyRange* p = findInRange(categories_, categories_ + categoryCount_, cp))
		return static_cast<GeneralCategory>(p->property);
	else
		return GC_OTHER_NOT_ASSIGNED;
}
END_OF_GET_CATEGORY
}

# generate blocks code
#
# Reads '${directory}Blocks.txt' and numbers the blocks in file order,
# starting from 0. Writes:
# - OUTPUT_DEF:   declarations of blocks_ and blockCount_
# - OUTPUT_TABLE: the blocks_ array ({first, last, block number}) and blockCount_
# - OUTPUT_IMPL:  CharProperty::getCodeBlock()
# Dies if the input file cannot be opened.
sub processCodeBlocks() {
	open(my $input, '<', $directory . 'Blocks.txt')
		or die "Input file '${directory}Blocks.txt' not found.\n";
	print 'generating blocks table...' . "\n";

	print OUTPUT_DEF 'static const PropertyRange blocks_[];' . "\n";
	print OUTPUT_DEF 'static const std::size_t blockCount_;' . "\n";
	print OUTPUT_TABLE 'const CharProperty::PropertyRange CharProperty::blocks_[] = {' . "\n";

	my $blockNumber = 0;	# CharProperty::CB_BASIC_LATIN
	while(my $line = <$input>) {
		# The pattern is anchored to the start of the line and must be followed by
		# ';' so that only data records match. Unanchored, it also matches comment
		# text such as "Start Code..End Code" ("de..E" are all hex digits) or
		# "@missing: 0000..10FFFF", which would add bogus entries and shift every
		# block number.
		next unless($line =~ /^([\dA-Fa-f]+)\.\.([\dA-Fa-f]+)\s*\;/);
		printf OUTPUT_TABLE '{0x%X,0x%X,%d},', hex($1), hex($2), $blockNumber++;
	}
	close $input;
	print OUTPUT_TABLE "};\n";
	print OUTPUT_TABLE 'const size_t CharProperty::blockCount_ = countof(CharProperty::blocks_);' . "\n";
	print OUTPUT_IMPL <<'END_OF_GET_BLOCK';
inline CharProperty::CodeBlock CharProperty::getCodeBlock(CodePoint cp) {
	if(const PropertyRange* p = findInRange(blocks_, blocks_ + blockCount_, cp))
		return static_cast<CodeBlock>(p->property);
	else
		return CB_NOT_ASSIGNED;
}
END_OF_GET_BLOCK
}

# generate scripts code
#
# Reads '${directory}Scripts.txt'. Each section of the file (terminated by a
# '# Total ...' line) is one script; scripts are numbered in file order
# starting from 0. Adjacent code points/ranges of a section are merged.
# Writes:
# - OUTPUT_DEF:   declarations of scripts_ and scriptCount_
# - OUTPUT_TABLE: the scripts_ array ({first, last, script number}) sorted by
#                 'first', and scriptCount_
# - OUTPUT_IMPL:  CharProperty::getScript()
# Dies if the input file cannot be opened.
sub processScripts() {
	open(my $input, '<', $directory . 'Scripts.txt')
		or die "Input file '${directory}Scripts.txt' not found.\n";
	print 'generating scripts table...' . "\n";

	print OUTPUT_DEF 'static const PropertyRange scripts_[];' . "\n";
	print OUTPUT_DEF 'static const std::size_t scriptCount_;' . "\n";
	print OUTPUT_TABLE 'const CharProperty::PropertyRange CharProperty::scripts_[] = {' . "\n";

	my @ranges;
	my $scriptNumber = 0;	# CharProperty::S_COMMON
	my ($first, $last);	# the range being accumulated; undefined while none is pending
	while(my $line = <$input>) {
		if($line =~ /^([\dA-Fa-f]+)(?:\.\.([\dA-Fa-f]+))?\s+\;/) {	# singleton or range
			my ($begin, $end) = ($1, $2);
			$begin = hex $begin;
			$end = defined($end) ? hex $end : $begin;
			if(defined $last and $begin == $last + 1) {$last = $end;}
			else {
				push @ranges, {'first' => $first, 'last' => $last, 'script' => $scriptNumber}
					if(defined $first);
				($first, $last) = ($begin, $end);
			}
		} elsif($line =~ /^\# Total/) {	# end of section
			push @ranges, {'first' => $first, 'last' => $last, 'script' => $scriptNumber}
				if(defined $first);
			# Forget the flushed range. Otherwise the first record of the next
			# section would either emit it a second time under the next script
			# number or be merged into it.
			($first, $last) = (undef, undef);
			++$scriptNumber;
		}
	}
	close $input;

	@ranges = sort {$a->{first} <=> $b->{first}} @ranges;
	foreach my $range (@ranges) {
		printf OUTPUT_TABLE '{0x%X,0x%X,%d},', $range->{first}, $range->{last}, $range->{script};
	}
	print OUTPUT_TABLE "};\n";
	print OUTPUT_TABLE 'const size_t CharProperty::scriptCount_ = countof(CharProperty::scripts_);' . "\n";
	print OUTPUT_IMPL <<'END_OF_GET_SCRIPT';
inline CharProperty::Script CharProperty::getScript(CodePoint cp) {
	if(const PropertyRange* p = findInRange(scripts_, scripts_ + scriptCount_, cp))
		return static_cast<Script>(p->property);
	else
		return S_COMMON;
}
END_OF_GET_SCRIPT
}

# generate binary properties code
#
# Reads '${directory}PropList.txt'. Each section of the file (terminated by a
# '# Total ...' line) is one property, named by the second field of its records.
# For every property one table and one hasBinaryProperty<> specialization are
# generated. The table is either
# - a sorted array of individual code points searched with std::binary_search
#   (chosen when few of the code points come from 'first..last' records), or
# - an array of CodeRange searched with findInRange.
# The element type is char_t unless the property contains a code point above
# U+FFFF. Finally the derived properties and the isXxx() helpers are written.
# Writes to OUTPUT_DEF (declarations), OUTPUT_TABLE (tables) and OUTPUT_IMPL
# (functions). Dies if the input file cannot be opened.
sub processBinaryProperties() {
	open(my $input, '<', $directory . 'PropList.txt')
		or die "Input file '${directory}PropList.txt' not found.\n";
	print 'generating binary properties table...' . "\n";

	my @ranges;	# [first, last] pairs of the current property
	my $readPoints = 0;	# number of code points of the current property
	my $continuedPoints = 0;	# ...of which were given by 'first..last' records
	my $ucs4 = 0;	# true if the current property has a code point above U+FFFF
	my $propertyName;
	while(my $line = <$input>) {
		$propertyName = $1 if($readPoints == 0 and $line =~ /^[\da-fA-F\.\s]+\;\s+(\w+)/);
		if($line =~ /^([\dA-Fa-f]+)(?:\.\.([\dA-Fa-f]+))?\s+/) {	# singleton or range
			my ($begin, $end) = ($1, $2);
			my $isRange = defined $end;
			my $first = hex $begin;
			my $last = $isRange ? hex $end : $first;
			push @ranges, [$first, $last];
			$readPoints += $last - $first + 1;
			$continuedPoints += $last - $first + 1 if($isRange);
			$ucs4 = 1 if($last > 0xFFFF);
		} elsif($line =~ /^\# Total/ and @ranges) {	# end of section
			my $codeType = $ucs4 ? 'CodePoint' : 'char_t';
			my $table = 'tableOfBp__' . $propertyName . '_';
			my $signature = 'template<> inline bool CharProperty::hasBinaryProperty<CharProperty::BP_'
				. uc($propertyName) . '>(CodePoint cp) {';

			if($continuedPoints * 3 / 2 < $readPoints) {	# simple array
				print OUTPUT_DEF 'static const ' . $codeType . ' ' . $table . "[];\n";
				print OUTPUT_TABLE 'const ' . $codeType . ' CharProperty::' . $table . '[] = {';
				# every code point of a range is listed, not only its first one,
				# so the table has exactly $readPoints elements
				foreach my $range (@ranges) {
					printf OUTPUT_TABLE '0x%04X,', $_ foreach($range->[0] .. $range->[1]);
				}
				# For a char_t table the argument is narrowed before the search;
				# code points above U+FFFF are rejected first so that they cannot
				# be truncated into a false match.
				print OUTPUT_IMPL $signature
					. 'return ' . ($ucs4 ? '' : 'cp<0x10000&&')
					. 'std::binary_search(' . $table . ',' . $table . '+' . $readPoints . ','
					. ($ucs4 ? 'cp);' : 'static_cast<char_t>(cp));')
					. "}\n";
			} else {	# range based array
				print OUTPUT_DEF 'static const CodeRange<' . $codeType . '> ' . $table . "[];\n";
				print OUTPUT_TABLE 'const CharProperty::CodeRange<' . $codeType . '> CharProperty::' . $table . '[] = {';
				foreach my $range (@ranges) {
					printf OUTPUT_TABLE '{0x%04X,0x%04X},', $range->[0], $range->[1];
				}
				# the end pointer is the number of ranges (table elements)
				print OUTPUT_IMPL $signature
					. 'return findInRange(' . $table . ',' . $table . '+' . scalar(@ranges) . ',cp)!=0;'
					. "}\n";
			}
			print OUTPUT_TABLE "};\n";
			@ranges = ();
			$readPoints = $continuedPoints = 0;
			$ucs4 = 0;
		}
	}
	close $input;

	# Derived properties and general category class tests.
	# The range comparisons below assume that the C++ GeneralCategory enumerators
	# are declared in the same order as the numbers of %categoryMap (L, M, N, P,
	# S, Z, C), which getGeneralCategory() already relies on when it casts the
	# table value -- TODO confirm against the enum declaration.
	# Alphabetic covers all of Lu..Lo (including Lm) plus Nl and Other_Alphabetic.
	print OUTPUT_IMPL <<'END_OF_DERIVED_PROPERTIES';
template<> inline bool CharProperty::hasBinaryProperty<CharProperty::BP_ALPHABETIC>(CodePoint cp) {
	const GeneralCategory gc = getGeneralCategory(cp);
	return (gc >= GC_LETTER_UPPERCASE && gc <= GC_LETTER_OTHER)
		|| gc == GC_NUMBER_LETTER
		|| hasBinaryProperty<BP_OTHER_ALPHABETIC>(cp);
}
template<> inline bool CharProperty::hasBinaryProperty<CharProperty::BP_DEFAULT_IGNORABLE_CODE_POINT>(CodePoint cp) {
	const GeneralCategory gc = getGeneralCategory(cp);
	return (gc == GC_OTHER_FORMAT
		|| gc == GC_OTHER_CONTROL
		|| gc == GC_OTHER_SURROGATE
		|| hasBinaryProperty<BP_OTHER_DEFAULT_IGNORABLE_CODE_POINT>(cp)
		|| hasBinaryProperty<BP_NONCHARACTER_CODE_POINT>(cp))
		&& !hasBinaryProperty<BP_WHITE_SPACE>(cp);
}
template<> inline bool CharProperty::hasBinaryProperty<CharProperty::BP_LOWERCASE>(CodePoint cp) {
	return getGeneralCategory(cp) == GC_LETTER_LOWERCASE || hasBinaryProperty<BP_OTHER_LOWERCASE>(cp);
}
template<> inline bool CharProperty::hasBinaryProperty<CharProperty::BP_GRAPHEME_EXTEND>(CodePoint cp) {
	const GeneralCategory gc = getGeneralCategory(cp);
	return gc == GC_MARK_ENCLOSING
		|| gc == GC_MARK_NONSPACING
		|| hasBinaryProperty<BP_OTHER_GRAPHEME_EXTEND>(cp);
}
template<> inline bool CharProperty::hasBinaryProperty<CharProperty::BP_GRAPHEME_BASE>(CodePoint cp) {
	const GeneralCategory gc = getGeneralCategory(cp);
	return gc != GC_OTHER_CONTROL
		&& gc != GC_OTHER_FORMAT
		&& gc != GC_OTHER_SURROGATE
		&& gc != GC_OTHER_PRIVATE_USE
		&& gc != GC_OTHER_NOT_ASSIGNED
		&& gc != GC_SEPARATOR_LINE
		&& gc != GC_SEPARATOR_PARAGRAPH
		&& !hasBinaryProperty<BP_GRAPHEME_EXTEND>(cp);
}
template<> inline bool CharProperty::hasBinaryProperty<CharProperty::BP_MATH>(CodePoint cp) {
	return getGeneralCategory(cp) == GC_SYMBOL_MATH || hasBinaryProperty<BP_OTHER_MATH>(cp);
}
template<> inline bool CharProperty::hasBinaryProperty<CharProperty::BP_UPPERCASE>(CodePoint cp) {
	return getGeneralCategory(cp) == GC_LETTER_UPPERCASE || hasBinaryProperty<BP_OTHER_UPPERCASE>(cp);
}
inline bool CharProperty::isLetter(CodePoint cp) {
	const GeneralCategory gc = getGeneralCategory(cp);
	return gc >= GC_LETTER_UPPERCASE && gc < GC_MARK_NONSPACING;
}
inline bool CharProperty::isMark(CodePoint cp) {
	const GeneralCategory gc = getGeneralCategory(cp);
	return gc >= GC_MARK_NONSPACING && gc < GC_NUMBER_DECIMAL_DIGIT;
}
inline bool CharProperty::isNumber(CodePoint cp) {
	const GeneralCategory gc = getGeneralCategory(cp);
	return gc >= GC_NUMBER_DECIMAL_DIGIT && gc < GC_PUNCTUATION_CONNECTOR;
}
inline bool CharProperty::isSymbol(CodePoint cp) {
	const GeneralCategory gc = getGeneralCategory(cp);
	return gc >= GC_SYMBOL_MATH && gc < GC_SEPARATOR_SPACE;
}
inline bool CharProperty::isPunctuation(CodePoint cp) {
	const GeneralCategory gc = getGeneralCategory(cp);
	return gc >= GC_PUNCTUATION_CONNECTOR && gc < GC_SYMBOL_MATH;
}
inline bool CharProperty::isSeparator(CodePoint cp) {
	const GeneralCategory gc = getGeneralCategory(cp);
	return gc >= GC_SEPARATOR_SPACE && gc < GC_OTHER_CONTROL;
}
inline bool CharProperty::isOther(CodePoint cp) {
	const GeneralCategory gc = getGeneralCategory(cp);
	return gc >= GC_OTHER_CONTROL && gc <= GC_OTHER_NOT_ASSIGNED;
}
END_OF_DERIVED_PROPERTIES
}

# generate case-folding code
#
# Reads the common (C) and simple (S) mappings of '${directory}CaseFolding.txt'
# and writes:
# - OUTPUT_TABLE: CASED_UCS2/FOLDED_UCS2 (source code points below U+10000 and
#                 their foldings) and CASED_UCS4/FOLDED_UCS4 (the rest)
# - OUTPUT_IMPL:  CharacterFolder::foldCase(), a binary search in CASED_xxx
#                 that returns the element of FOLDED_xxx at the same index
# The records are used in file order; the generated binary search requires
# that order to be ascending by source code point.
# Dies if the input file cannot be opened or if a source code point below
# U+10000 folds to one that does not fit into FOLDED_UCS2.
sub processCaseFolding() {
	open(my $input, '<', $directory . 'CaseFolding.txt')
		or die "Input file '${directory}CaseFolding.txt' not found.\n";
	print 'generating case-folding table...' . "\n";

	my (@cased, @folded);
	while(my $line = <$input>) {
		next unless($line =~ /^([\dA-Fa-f]+)\;\s+[CS]\;\s+([\dA-Fa-f]+)/);
		push @cased, hex($1);
		push @folded, hex($2);
	}
	close $input;

	# Both table pairs are split at the same index so that FOLDED_xxx[i] always
	# belongs to CASED_xxx[i]. The counts are defined even if one part is empty.
	my $ucs2Count = 0;
	++$ucs2Count while($ucs2Count <= $#cased and $cased[$ucs2Count] < 0x10000);
	my $ucs4Count = $#cased + 1 - $ucs2Count;
	foreach my $i (0 .. $ucs2Count - 1) {
		die sprintf("Folding of %04X does not fit into FOLDED_UCS2.\n", $cased[$i])
			if($folded[$i] >= 0x10000);
	}

	print OUTPUT_TABLE 'const char_t CharacterFolder::CASED_UCS2[] = {' . "\n";
	printf OUTPUT_TABLE '0x%X,', $_ foreach(@cased[0 .. $ucs2Count - 1]);
	print OUTPUT_TABLE "};\n" . 'const CodePoint CharacterFolder::CASED_UCS4[] = {' . "\n";
	printf OUTPUT_TABLE '0x%X,', $_ foreach(@cased[$ucs2Count .. $#cased]);
	print OUTPUT_TABLE "};\n" . 'const char_t CharacterFolder::FOLDED_UCS2[] = {' . "\n";
	printf OUTPUT_TABLE '0x%X,', $_ foreach(@folded[0 .. $ucs2Count - 1]);
	print OUTPUT_TABLE "};\n" . 'const CodePoint CharacterFolder::FOLDED_UCS4[] = {' . "\n";
	printf OUTPUT_TABLE '0x%X,', $_ foreach(@folded[$ucs2Count .. $#folded]);
	print OUTPUT_TABLE "};\n";

	# std::lower_bound returns the end pointer when cp is greater than every
	# table element, so the result is compared with the end before it is read.
	print OUTPUT_IMPL <<"END_OF_CASE_FOLDING";
inline CodePoint CharacterFolder::foldCase(CodePoint cp) {
	if(cp < 0x10000) {
		const char_t* const last = CASED_UCS2 + $ucs2Count;
		const char_t* const p = std::lower_bound(CASED_UCS2, last, static_cast<char_t>(cp));
		return (p != last && *p == cp) ? FOLDED_UCS2[p - CASED_UCS2] : cp;
	} else {
		const CodePoint* const last = CASED_UCS4 + $ucs4Count;
		const CodePoint* const p = std::lower_bound(CASED_UCS4, last, cp);
		return (p != last && *p == cp) ? FOLDED_UCS4[p - CASED_UCS4] : cp;
	}
}
END_OF_CASE_FOLDING
}

# generate NFD code
#
# Reads the decomposition mapping field of '${directory}UnicodeData.txt'.
# Only mappings made of hexadecimal code points alone are taken (mappings
# carrying a '<tag>' do not match the pattern). Writes:
# - OUTPUT_DEF:   declarations of NFD_SRC and NFD_RES
# - OUTPUT_TABLE: NFD_SRC (decomposable code points, in file order) and
#                 NFD_RES (the mapping of the element at the same index)
# - OUTPUT_IMPL:  the two CharProperty::toNFD() overloads
# A one-to-one mapping is stored as is. A one-to-two mapping is packed as
# (first << 16 | second); for sources in U+10000..U+1FFFF both halves are
# stored relative to U+10000. The generated code tells the two forms apart by
# comparing the stored value with 0x10FFFF.
# Dies if the input file cannot be opened or if a mapping cannot be packed.
sub processNFD() {
	open(my $input, '<', $directory . 'UnicodeData.txt')
		or die "Input file '${directory}UnicodeData.txt' not found.\n";
	print 'generating NFD table...' . "\n";

	my (@src, @nfd);
	while(my $line = <$input>) {
		next unless($line =~ m/^([\dA-Fa-f]+)\;[^;]+\;[^;]+\;[^;]+\;[^;]+\;([\w\s]+)\;/);
		my ($source, $mapping) = (hex($1), $2);
		my @pair = map {hex $_} split(' ', $mapping);
		if($#pair == 0) {	# one-to-one mapping
			push @nfd, $pair[0];
		} else {	# one-to-two mapping
			# this hack works at this time (Unicode 4.1)...
			# a SMP character is to two SMP characters
			# a SIP character is to one character
			if($source >= 0x10000 and $source < 0x20000) {
				$pair[0] -= 0x10000;
				$pair[1] -= 0x10000;
			}
			# The packed form needs exactly two halves of 16 bits each, and a
			# first half above 0x10 so that the packed value exceeds 0x10FFFF.
			die sprintf("Decomposition of %04X cannot be packed.\n", $source)
				unless($#pair == 1
					and $pair[0] > 0x10 and $pair[0] <= 0xFFFF
					and $pair[1] >= 0 and $pair[1] <= 0xFFFF);
			push @nfd, $pair[0] << 16 | $pair[1];
		}
		push @src, $source;
	}
	close $input;

	print OUTPUT_DEF 'static const CodePoint NFD_SRC[];';
	print OUTPUT_DEF 'static const CodePoint NFD_RES[];';
	print OUTPUT_TABLE 'const CodePoint CharProperty::NFD_SRC[] = {';
	printf OUTPUT_TABLE '0x%X,', $_ foreach(@src);
	print OUTPUT_TABLE "};\n";
	print OUTPUT_TABLE 'const CodePoint CharProperty::NFD_RES[] = {';
	printf OUTPUT_TABLE '0x%X,', $_ foreach(@nfd);
	print OUTPUT_TABLE "};\n";

	# The search range must span all elements of NFD_SRC, and the result of
	# std::lower_bound may be the end pointer, which must not be dereferenced.
	# In the second overload the conditional operator is parenthesized because
	# it binds weaker than '+'.
	my $srcCount = $#src + 1;
	print OUTPUT_IMPL <<"END_OF_NFD";
inline std::size_t CharProperty::toNFD(CodePoint cp, CodePoint& first, CodePoint& second) {
	const CodePoint* const last = NFD_SRC + $srcCount;
	const CodePoint* const p = std::lower_bound(NFD_SRC, last, cp);
	if(p == last || *p != cp)
		return (first = cp), 1;
	const CodePoint res = NFD_RES[p - NFD_SRC];
	if(res <= 0x10FFFF)
		return (first = res), 1;
	else if(cp >= 0x10000 && cp < 0x20000)
		return (first = (res >> 16) + 0x10000), (second = (res & 0xFFFF) + 0x10000), 2;
	else
		return (first = (res >> 16)), (second = (res & 0xFFFF)), 2;
}
inline length_t CharProperty::toNFD(CodePoint cp, char_t* dest) {
	CodePoint first, second;
	if(toNFD(cp, first, second) == 1)
		return UTF16Surrogates::encode(first, dest) ? 2 : 1;
	else {
		const length_t c = UTF16Surrogates::encode(first, dest) ? 2 : 1;
		return c + (UTF16Surrogates::encode(second, dest + c) ? 2 : 1);
	}
}
END_OF_NFD
}


# open output files
# The three handles are global because every processXxx() routine prints to them.
# Error messages name the file that was actually opened and include the reason.
my $header = '// automatically generated by props.pl at $ ' . scalar(localtime) . " \$\n";
my $tableFile = '..\\UnicodeProperties_Table';
my $definitionFile = '..\\UnicodeProperties_Definition';
my $implementationFile = '..\\UnicodeProperties_Implementation';
open(OUTPUT_TABLE, '>', $tableFile)
	or die "Cannot open output file '$tableFile': $!\n";
print OUTPUT_TABLE $header;
open(OUTPUT_DEF, '>', $definitionFile)
	or die "Cannot open output file '$definitionFile': $!\n";
print OUTPUT_DEF $header;
open(OUTPUT_IMPL, '>', $implementationFile)
	or die "Cannot open output file '$implementationFile': $!\n";
print OUTPUT_IMPL $header;

# version diagnostics
print OUTPUT_TABLE '#if ASCENSION_UNICODE_VERSION != 0x0410' . "\n";
print OUTPUT_TABLE '#error These code are based on old version of Unicode.' . "\n";
print OUTPUT_TABLE '#endif' . "\n";

# process all!
defineInternalStructures();
processGeneralCategories();
processCodeBlocks();
processScripts();
processBinaryProperties();
processCaseFolding();
processNFD();

# buffered write errors surface at close, so the results are checked
# before success is reported
close(OUTPUT_TABLE) or die "Cannot write output file '$tableFile': $!\n";
close(OUTPUT_DEF) or die "Cannot write output file '$definitionFile': $!\n";
close(OUTPUT_IMPL) or die "Cannot write output file '$implementationFile': $!\n";
print "done.\n";

__END__