#!perl

# UnicodeCharClassify.pl
# (c) 2016-2017 JOJO
# update: 2017.09.20 scintilla 3.7.6
# create: 2016.06.20

#Usage: perl scintilla/scripts/UnicodeCharClassify.pl > scintilla/src/UnicodeCharClassifyData

use utf8;
use warnings;
use strict;
use File::Basename;

# When true, every emitted entry gets a trailing "// <code point>, CharClassify::<class>" comment.
my $DEBUG = 0;

# Input file, expected next to this script ('scintilla/scripts/UnicodeData.txt').
my $UnicodeData = File::Basename::dirname($0)."/UnicodeData.txt";

# scintilla/src/CharClassify.h : enum cc { ccSpace, ccNewLine, ccWord, ccPunctuation };
my @scc = ('ccSpace', 'ccNewLine', 'ccWord', 'ccPunctuation');
my $ccSpace = 0;
my $ccNewLine = 1;
my $ccWord = 2;
my $ccPunctuation = 3;

# Each table entry is (code point << $BITSHIFT) | class; $BITMASK extracts the class again.
my $BITSHIFT = 8;
my $BITMASK = (1 << $BITSHIFT) - 1;

# General Category (field 2 of UnicodeData.txt) -> character class.
# Mirrors scintilla v3.7.6, scintilla/src/Document.cxx:
#   CharClassify::cc Document::WordCharacterClass(unsigned int ch) const { ... }
# A category that is not listed here is classified as ccPunctuation (see _char_class).
my %class_of_category = (
	# Separator, Line/Paragraph
	Zl => $ccNewLine,
	Zp => $ccNewLine,	# U+2029 PARAGRAPH SEPARATOR

	# Separator, Space
	Zs => $ccSpace,		# U+0020<SPACE> U+00A0<NBSP> U+3000<　> ...(17)
	# Other
	Cc => $ccSpace,
	Cf => $ccSpace,
	Cs => $ccSpace,
	Co => $ccSpace,
	Cn => $ccSpace,

	# Letter
	Lu => $ccWord,
	Ll => $ccWord,
	Lt => $ccWord,		# ῼ
	Lm => $ccWord,		# 々〱〲〳〴〵〻ゝゞーヽヾｰﾞﾟ
	Lo => $ccWord,		# ぁあぃいぅうぇえぉおゐゑゔゕゖゟァアィイゥウェエォヰヱヴヵヶヷヸヹヺヿｧｨｩｪｫｱｲｳｴｵ
	# Number
	Nd => $ccWord,		# 0123456789０１２３４５６７８９
	Nl => $ccWord,		# ⅠⅡⅢⅰⅱⅲ
	No => $ccWord,		# ²³¹¼½¾①②③⑴⑵⑶
	# Mark - includes combining diacritics
	Mn => $ccWord,
	Mc => $ccWord,
	Me => $ccWord,

	# Punctuation
	Pc => $ccPunctuation,	# _‿⁀⁔︳︴﹍﹎﹏＿	TODO: ccWord
	Pd => $ccPunctuation,
	Ps => $ccPunctuation,
	Pe => $ccPunctuation,
	Pi => $ccPunctuation,
	Pf => $ccPunctuation,
	Po => $ccPunctuation,
	# Symbol
	Sm => $ccPunctuation,
	Sc => $ccPunctuation,
	Sk => $ccPunctuation,
	So => $ccPunctuation,
);

# Return the character class for one code point.
#   $icp - numeric code point
#   $vgc - its General Category string
# Fixed per-code-point overrides are tested first, in the same order as
# scintilla/src/CharClassify.cxx: CharClassify::SetDefaultCharClasses(); every
# other code point is classified through %class_of_category.
sub _char_class {
	my ($icp, $vgc) = @_;

	return $ccNewLine if $icp == 0x000A;	# (Cc) <LF>
	return $ccNewLine if $icp == 0x000D;	# (Cc) <CR>
	return $ccSpace   if $icp <= 0x0020;	# <TAB><FF><BS><ESC>...
	return $ccWord    if $icp == 0x005F;	# (Pc) '_'
	return $ccWord    if $icp == 0xFF3F;	# (Pc) '＿' #ifdef NIHONGO

	return exists $class_of_category{$vgc} ? $class_of_category{$vgc} : $ccPunctuation;
}

# Print one table entry to STDOUT.
#   $code_point - numeric code point at which the class starts
#   $class      - index into @scc
#   $label      - code point text used only in the $DEBUG comment
sub _emit_entry {
	my ($code_point, $class, $label) = @_;

	my $v = $code_point << $BITSHIFT | $class;
	print "$v,";
	print "\t// $label, CharClassify::$scc[$class]" if $DEBUG;
	print "\n";
	return;
}

# State carried from the previous record: its code point and its class.
# -1 means "none yet" / "force the next entry to be emitted".
my $icp0 = -1;
my $icc0 = -1;

open my $fh, '<:encoding(UTF-8)', $UnicodeData
	or die "Cannot open '$UnicodeData': $!";

print "/*
 * http://www.unicode.org/Public/9.0.0/ucd/UnicodeData.txt
 */
static const int SHIFT = $BITSHIFT;
static const int MASK = $BITMASK;
static const int data[] = {\n";

while (my $line = <$fh>) {
	# 0000;<control>;Cc;0;BN;;;;;N;NULL;;;;
	next unless $line =~ /\S/;	# a blank line carries no record; skip it instead of aborting
	chomp $line;

	# 0. Code value, 1. Character name, 2. General Category
	my ($vcp, $vcn, $vgc) = split /;/, $line;
	die "Malformed record in '$UnicodeData' line $.: no General Category field" unless $vgc;

	my $icp = hex($vcp);

	# The code points between the previous record and this one are undefined:
	# start a ccSpace run at the first missing code point. A record whose name
	# ends in ", Last>" closes a range instead, so the skipped code points keep
	# the class of the range and nothing is emitted for them.
	if (++$icp0 != $icp and $vcn !~ m/, Last>$/) {
		_emit_entry($icp0, $ccSpace, sprintf('%04X', $icp0));
		$icc0 = -1;	# always emit the record that follows the gap
	}

	# Only a change of class starts a new table entry.
	my $icc = _char_class($icp, $vgc);
	_emit_entry($icp, $icc, $vcp) if $icc != $icc0;

	$icp0 = $icp;
	$icc0 = $icc;
}
print "};\n";
close $fh;
