#!/usr/bin/perl
# generate cp50221 mapping table
# Copyright (C) 2006  MIRACLE LINUX CORPORATION.
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA

use File::Basename;

# Directory part of the path this script was invoked with; the mapping
# source file is looked up in that directory.
$MAPDIR = dirname($0);

# Name of the source mapping table, obtainable from
# ftp://ftp.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP932.TXT
$CP932TXT = "CP932.TXT";

# Column indices of one @tbl row:
#   $EUC - code as returned by sjis2euc()
#   $UNI - Unicode code point
#   $COM - comment field copied from the mapping file
#   $FB  - fallback indicator (the number printed after "|" in the output)
$EUC = 0;
$UNI = 1;
$COM = 2;
$FB  = 3;

# Convert a CP932 (Shift_JIS) code to the EUC-JP style code used as the
# key of the mapping table.
#
#   $sjis - numeric code: 0x00-0xFF for single bytes, otherwise
#           (lead byte << 8) | trail byte.
#
# Returns:
#   - the byte unchanged for single bytes 0x00-0x80;
#   - 0x8E00 + byte for single bytes above 0x80 (0x8E is the prefix byte
#     EUC-JP puts in front of half-width katakana);
#   - for double-byte codes, (row byte << 8) | cell byte with both bytes
#     starting at 0xA1.
#
# All temporaries are lexical so that calling this function never
# clobbers a caller's variables of the same name.
sub sjis2euc {
	my ($sjis) = @_;
	my $lead  = ($sjis >> 8) & 0xFF;
	my $trail = $sjis & 0xFF;

	if ($lead == 0) {
		return $trail > 0x80 ? 0x8e00 + $trail : $trail;
	}

	# Lead bytes run 0x81-0x9F and then continue at 0xE0; close the gap.
	$lead -= 0x40 if $lead > 0xDF;
	# Trail bytes skip 0x7F; close that gap as well.
	$trail-- if $trail > 0x7E;

	# Each lead byte covers 188 trail positions, i.e. two rows of 94 cells.
	my $linear = ($lead - 0x81) * 188 + $trail - 0x40;
	my $row    = int($linear / 94) + 0xa1;
	my $cell   = $linear % 94 + 0xa1;
	return $row * 256 + $cell;
}

# Load the CP932 mapping file into @tbl and %u2e.
#
# @tbl receives one row [EUC code, Unicode code point, comment, fallback
# indicator] for every mapped code below 0xF040.  %u2e records, for each
# Unicode code point, the EUC code chosen as its primary mapping; rows
# that are not the primary mapping get fallback indicator 3.
my $cp932_path = "$MAPDIR/$CP932TXT";
open(my $cp932_fh, '<', $cp932_path)
	or die "$0: cannot open $cp932_path: $!\n";

while (my $line = <$cp932_fh>) {
	next if $line =~ /^#/;

	my ($sjis_field, $uni_field, $comment) = split(/\t+/, $line);
	# Skip blank or malformed lines (no second column) and codes without
	# a Unicode mapping (the column holds only spaces).
	next if !defined($uni_field) || $uni_field =~ /^ +/;

	# Both fields look like "0xNNNN"; drop the "0x" prefix before hex().
	my $uni  = hex(substr($uni_field, 2, 4));
	my $sjis = hex(substr($sjis_field, 2));
	# Codes from 0xF040 upward are not part of the generated tables.
	next if ($sjis >= 0xF040);

	chomp($comment);
	my $euc = sjis2euc($sjis);
	my $row = [$euc, $uni, $comment, 0];
	push(@tbl, $row);

	if (($sjis >= 0x00 && $sjis <= 0xFF)
	    || ($sjis >= 0x8140 && $sjis <= 0x84FC)
	    || ($sjis >= 0x889F && $sjis <= 0xEAFC)) {
		# Single bytes and the main double-byte ranges always win,
		# even over a mapping recorded earlier.
		$u2e{$uni} = $euc;
	} elsif (!defined($u2e{$uni})
	    && (($sjis >= 0x8740 && $sjis <= 0x879E)
	        || ($sjis >= 0xED40 && $sjis <= 0xEEFC))) {
		# 0x8740-0x879E and 0xED40-0xEEFC (the NEC rows named in the
		# usage text) become primary only when the code point has no
		# mapping yet.
		$u2e{$uni} = $euc;
	} else {
		# Duplicate of an existing mapping: keep the row, but mark it
		# with fallback indicator 3.
		$row->[$FB] = 3;
	}
}
close($cp932_fh);

# Sort comparator for @tbl rows ($a and $b are array refs).
#
# Orders rows by Unicode code point; among rows sharing a code point the
# primary mapping (the one recorded in %u2e) comes first, and any
# remaining tie is broken by EUC code.  The comparator is a consistent
# total order and returns 0 for identical rows; perl's sort does not
# define its result for a comparator that contradicts itself.
sub _cmp_tbl_entries {
	my $by_unicode = $a->[$UNI] <=> $b->[$UNI];
	return $by_unicode if $by_unicode;

	my $primary = $u2e{$a->[$UNI]};
	my $a_rank = (defined($primary) && $primary == $a->[$EUC]) ? 0 : 1;
	my $b_rank = (defined($primary) && $primary == $b->[$EUC]) ? 0 : 1;

	return ($a_rank <=> $b_rank) || ($a->[$EUC] <=> $b->[$EUC]);
}

@tbl = sort _cmp_tbl_entries @tbl;

# Print the "cp50221-ascii" charmap to the currently selected output
# handle.
#
# Emits one line per @tbl row whose EUC code is below 0x80.  ESC (0x1B),
# SO (0x0E) and SI (0x0F) are written with indicator 1 instead of the
# row's own fallback indicator -- presumably because those bytes are
# reserved for charset switching; confirm against the consumer of the
# generated file.  Takes no arguments; uses only lexical temporaries.
sub print_usascii {
	print <<'END';
<code_set_name> "cp50221-ascii"
<mb_cur_min> 1
<mb_cur_max> 1
<subchar> \x3F
CHARMAP
END

	foreach my $entry (@tbl) {
		my $code = $entry->[$EUC];
		next unless $code < 0x80;

		my $is_switch_code = ($code == 0x1B || $code == 0x0E || $code == 0x0F);
		my $fallback = $is_switch_code ? 1 : $entry->[$FB];
		# The stored comment starts with "#"; the format string supplies
		# its own, so drop the first character.
		my $comment = substr($entry->[$COM], 1);

		printf("<U%04X> \\x%02X |%d # %s\n",
		       $entry->[$UNI], $code, $fallback, $comment);
	}
	print "END CHARMAP\n";
}

# Print the "cp50221-jisx0201-latin" charmap to the currently selected
# output handle.
#
# Emits one line per @tbl row whose EUC code is below 0x80, except ESC
# (0x1B), SO (0x0E) and SI (0x0F), which are left out entirely.  Every
# line carries fallback indicator 3 regardless of the row's own value.
# Takes no arguments; uses only lexical temporaries.
sub print_jisx0201_latin {
	print <<'END';
<code_set_name> "cp50221-jisx0201-latin"
<mb_cur_min> 1
<mb_cur_max> 1
<subchar> \x3F
CHARMAP
END

	foreach my $entry (@tbl) {
		my $code = $entry->[$EUC];
		next unless $code < 0x80;
		next if $code == 0x1B || $code == 0x0E || $code == 0x0F;

		# The stored comment starts with "#"; the format string supplies
		# its own, so drop the first character.
		my $comment = substr($entry->[$COM], 1);

		printf("<U%04X> \\x%02X |%d # %s\n",
		       $entry->[$UNI], $code, 3, $comment);
	}
	print "END CHARMAP\n";
}

# Print a JIS X 0201 katakana charmap to the currently selected output
# handle.
#
#   $esc - 1 selects the "cp50221-jisx0201-kana" table, whose lines keep
#          each row's own fallback indicator; any other value selects
#          "cp50221-jisx0201-kana-sosi", whose lines all carry
#          indicator 3.
#
# Only @tbl rows with EUC codes 0x8EA1-0x8EDF are written; the byte
# printed is the low seven bits of the EUC code.  The argument and all
# temporaries are lexical.
sub print_jisx0201_kana {
	my ($esc) = @_;
	my $use_own_fallback = ($esc == 1);

	my $charset = $use_own_fallback
	    ? "cp50221-jisx0201-kana"
	    : "cp50221-jisx0201-kana-sosi";
	print "<code_set_name> \"$charset\"\n";
	print <<'END';
<mb_cur_min> 1
<mb_cur_max> 1
<subchar> \x3F
CHARMAP
END

	foreach my $entry (@tbl) {
		my $euc = $entry->[$EUC];
		next unless $euc >= 0x8EA1 && $euc <= 0x8EDF;

		my $fallback = $use_own_fallback ? $entry->[$FB] : 3;
		# The stored comment starts with "#"; the format string supplies
		# its own, so drop the first character.
		my $comment = substr($entry->[$COM], 1);

		printf("<U%04X> \\x%02X |%d # %s\n",
		       $entry->[$UNI], $euc & 0x7F, $fallback, $comment);
	}
	print "END CHARMAP\n";
}

# Emit the PRIVATE USE AREA block once the sorted table steps over it.
#
#   $before  - Unicode code point of the previously handled row
#   $current - Unicode code point of the row being handled
#   $fb      - fallback indicator written on every generated line
#
# When $before is below U+E000 and $current is U+F900 or above, prints
# 20 rows (first byte 0x7F-0x92) of 94 cells (second byte 0x21-0x7E),
# numbered consecutively from U+E000, and returns 0.  Otherwise prints
# nothing and returns 1, meaning the block is still to be written.
sub make_private_area_mapping {
    my ($before, $current, $fb) = @_;

    return 1 unless $before < 0xE000 && $current >= 0xF900;

    foreach my $row (0x7F .. 0x92) {
        foreach my $cell (0x21 .. 0x7E) {
            my $code_point = 0xE000 + ($row - 0x7F) * 94 + ($cell - 0x21);
            printf("<U%04X> \\x%02X\\x%02X |%d # PRIVATE USE AREA\n",
                   $code_point, $row, $cell, $fb);
        }
    }
    return 0;
}

# Print a double-byte charmap to the currently selected output handle.
#
#   $jis83 - 1 selects the "cp50221-jisx0208-1983" table, whose lines
#            keep each row's own fallback indicator; any other value
#            selects "cp50221-jisx0208-1978", whose lines all carry
#            indicator 3.
#
# Every @tbl row with an EUC code of 0xA1A1 or above is written with both
# bytes reduced to seven bits.  While walking the table,
# make_private_area_mapping() is called on each row until it reports
# (by returning 0) that the PRIVATE USE AREA block has been written; it
# receives the fallback indicator of the row being handled at that
# moment.  The argument and all temporaries are lexical.
sub print_jisx0208_ms {
	my ($jis83) = @_;
	my $use_own_fallback = ($jis83 == 1);

	my $charset = $use_own_fallback
	    ? "cp50221-jisx0208-1983"
	    : "cp50221-jisx0208-1978";
	print "<code_set_name> \"$charset\"\n";
	print <<'END';
<mb_cur_min> 1
<mb_cur_max> 2
<subchar> \x3F
CHARMAP
END

	my $pua_pending  = 1;
	my $previous_uni = 0;
	foreach my $entry (@tbl) {
		my $uni      = $entry->[$UNI];
		my $euc      = $entry->[$EUC];
		my $fallback = $use_own_fallback ? $entry->[$FB] : 3;

		if ($pua_pending) {
			$pua_pending =
			    make_private_area_mapping($previous_uni, $uni, $fallback);
		}
		if ($euc >= 0xA1A1) {
			# The stored comment starts with "#"; the format string
			# supplies its own, so drop the first character.
			my $comment = substr($entry->[$COM], 1);
			printf("<U%04X> \\x%02X\\x%02X |%d # %s\n",
			       $uni, ($euc >> 8) & 0x7F, $euc & 0x7F,
			       $fallback, $comment);
		}
		$previous_uni = $uni;
	}
	print "END CHARMAP\n";
}

# Choose the table to generate from the first command-line argument.
# Each keyword is matched as an unanchored pattern and the first hit
# wins, so "jisx0201-kana-sosi" has to be listed before its prefix
# "jisx0201-kana".  With no matching argument the usage text is printed.
my @generators = (
	[ "us-ascii",           sub { print_usascii() } ],
	[ "jisx0201-latin",     sub { print_jisx0201_latin() } ],
	[ "jisx0201-kana-sosi", sub { print_jisx0201_kana(0) } ],
	[ "jisx0201-kana",      sub { print_jisx0201_kana(1) } ],
	[ "jisx0208-1978",      sub { print_jisx0208_ms(0) } ],
	[ "jisx0208-1983",      sub { print_jisx0208_ms(1) } ],
);

my $generate;
foreach my $candidate (@generators) {
	my ($keyword, $handler) = @$candidate;
	if ($ARGV[0] =~ $keyword) {
		$generate = $handler;
		last;
	}
}

if (defined $generate) {
	$generate->();
} else {
	print <<END;
Usage: gen_cp50221ucm.pl name
  name
    us-ascii:
      Escape Sequence: ESC ( B
      Character Set  : US-ASCII (ISO/IEC 646 IRV)
    jisx0201-latin:
      Escape Sequence: ESC ( J
      Character Set  : JIS X 0201 Latin
    jisx0201-kana:
      Escape Sequence: ESC ( I
      Character Set  : JIS X 0201 Katakana
    jisx0201-kana-sosi:
      Control Code   : SO/SI
      Character Set  : JIS X 0201 Katakana
    jisx0208-1978:
      Escape Sequence: ESC \$ @
      Character Set  : JIS X 0208:1997
                       NEC special characters (Row 13)
                       NEC selection of IBM extensions (Rows 89 to 92)
                       User Defined Character (Rows 95 to 114)
    jisx0208-1983:
      Escape Sequence: ESC \$ B
      Character Set  : JIS X 0208:1997
                       NEC special characters (Row 13)
                       NEC selection of IBM extensions (Rows 89 to 92)
                       User Defined Character (Rows 95 to 114)
END
}
