#!/usr/bin/env perl
#
# Output a conversion table from multibyte character to UCS character
# Copyright (C) 2006  MIRACLE LINUX CORPORATION.
# 
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
# 
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
# 
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA

use strict;
use Encode;
use Encode::EUCJPMS;

my @singlebyte;
my @doublebyte_1st;
my @doublebyte_2nd;
my @triplebyte_1st;
my @triplebyte_2nd;
my @triplebyte_3rd;

sub is_singlebyte {
    my ($c1) = @_;
    return defined($singlebyte[$c1]);
}

sub is_doublebyte {
    my ($c1, $c2) = @_;
    return defined($doublebyte_1st[$c1]) && defined($doublebyte_2nd[$c2]);
}

# Report whether ($c1, $c2, $c3) looks like a three-byte character: each
# byte must have been recorded for its own position (the tables are filled
# in by dump_mb2ucs_3).  Positions are looked up independently; the exact
# triple is not remembered.
sub is_triplebyte {
    my ($lead, $middle, $tail) = @_;

    # Bail out at the first position that was never seen, handing back the
    # same false value that defined() produced.
    my $lead_seen = defined $triplebyte_1st[$lead];
    return $lead_seen unless $lead_seen;

    my $middle_seen = defined $triplebyte_2nd[$middle];
    return $middle_seen unless $middle_seen;

    return defined $triplebyte_3rd[$tail];
}

# Print one table row on the currently selected output handle.
#
#   $mb  - the multibyte string; every byte is written as \xHH
#   $ucs - the decoded string; every character is written as <UHHHH>
#
# The two columns are separated by one space and the row ends with a
# newline.  When $mb is empty, exactly one byte and one character are still
# printed (taken from offset 0 of each string, so an empty string yields 0).
sub print_mb2ucs_one_char {
    my ($mb, $ucs) = @_;

    my $byte_count = length($mb);
    my $char_count = length($ucs);

    # An empty multibyte string is still shown as a single entry.
    if ($byte_count == 0) {
        ($byte_count, $char_count) = (1, 1);
    }

    my $mb_column = join '',
        map { sprintf("\\x%02X", ord(substr($mb, $_, 1))) }
        0 .. $byte_count - 1;

    my $ucs_column = join '',
        map { sprintf("<U%04X>", ord(substr($ucs, $_, 1))) }
        0 .. $char_count - 1;

    print $mb_column . " " . $ucs_column . "\n";
}

# Try every byte value 0x00-0xFF as a one-byte character in $codeset.
#
# A byte is accepted unless the first decoded character is U+FFFD (the
# replacement character Encode substitutes for undecodable input) or a
# non-zero byte produced U+0000 / nothing.  Each accepted byte is printed
# with print_mb2ucs_one_char and marked in @singlebyte so that the longer
# scans can skip it as a lead byte.
sub dump_mb2ucs_1 {
    my ($codeset) = @_;

    for my $byte (0x00 .. 0xFF) {
        my $octets  = pack('C', $byte);
        my $decoded = decode($codeset, $octets);
        my $first   = ord(substr($decoded, 0, 1));

        next if $first == 0xFFFD;

        # Only the NUL byte itself may legitimately map to U+0000.
        next if $byte != 0 && $first == 0;

        print_mb2ucs_one_char($octets, $decoded);
        $singlebyte[$byte] = 1;
    }
}

# Try every two-byte sequence whose first byte is 0x80-0xFF (and is not
# already a single-byte character) and whose second byte is 0x01-0xFF.
#
# A sequence is accepted unless its first decoded character is U+FFFD or
# U+0000 / nothing.  Each accepted sequence is printed with
# print_mb2ucs_one_char, and its bytes are marked in @doublebyte_1st and
# @doublebyte_2nd for use by is_doublebyte.
sub dump_mb2ucs_2 {
    my ($codeset) = @_;

    for my $lead (0x80 .. 0xFF) {
        next if is_singlebyte($lead);

        for my $trail (0x01 .. 0xFF) {
            my $octets  = pack('CC', $lead, $trail);
            my $decoded = decode($codeset, $octets);
            my $first   = ord(substr($decoded, 0, 1));

            next if $first == 0xFFFD;
            next if $lead != 0 && $first == 0;

            print_mb2ucs_one_char($octets, $decoded);
            $doublebyte_1st[$lead]  = 1;
            $doublebyte_2nd[$trail] = 1;
        }
    }
}

# Try every three-byte sequence whose first byte is 0x80-0xFF and whose
# remaining bytes are 0x01-0xFF, skipping lead bytes that are single-byte
# characters and two-byte prefixes that is_doublebyte already recognises.
#
# A sequence is accepted unless its first decoded character is U+FFFD or
# U+0000 / nothing.  Each accepted sequence is printed with
# print_mb2ucs_one_char, and its bytes are marked in @triplebyte_1st,
# @triplebyte_2nd and @triplebyte_3rd for use by is_triplebyte.
sub dump_mb2ucs_3 {
    my ($codeset) = @_;

    for my $lead (0x80 .. 0xFF) {
        next if is_singlebyte($lead);

        for my $middle (0x01 .. 0xFF) {
            next if is_doublebyte($lead, $middle);

            for my $tail (0x01 .. 0xFF) {
                my $octets  = pack('CCC', $lead, $middle, $tail);
                my $decoded = decode($codeset, $octets);
                my $first   = ord(substr($decoded, 0, 1));

                next if $first == 0xFFFD;
                next if $lead != 0 && $first == 0;

                print_mb2ucs_one_char($octets, $decoded);
                $triplebyte_1st[$lead]   = 1;
                $triplebyte_2nd[$middle] = 1;
                $triplebyte_3rd[$tail]   = 1;
            }
        }
    }
}
 
# Try every four-byte sequence whose first byte is 0x80-0xFF and whose
# remaining bytes are 0x01-0xFF, and print each one that decodes in
# $codeset with print_mb2ucs_one_char.
#
# Prefixes that are already complete shorter characters are skipped:
#   - a lead byte accepted by is_singlebyte,
#   - a two-byte prefix accepted by is_doublebyte,
#   - a three-byte prefix accepted by is_triplebyte.
#
# A sequence is rejected when its first decoded character is U+FFFD (the
# replacement character Encode substitutes for undecodable input) or
# U+0000 / nothing.
#
# This scan is exhaustive (up to 128 * 255**3 decode calls).
sub dump_mb2ucs_4 {
    my $codeset = shift;

    for my $c1 (0x80 .. 0xFF) {
        next if is_singlebyte($c1);

        for my $c2 (0x01 .. 0xFF) {
            next if is_doublebyte($c1, $c2);

            for my $c3 (0x01 .. 0xFF) {
                # The three-byte prefix must be tested here; repeating the
                # two-byte test would never skip anything at this depth.
                next if is_triplebyte($c1, $c2, $c3);

                for my $c4 (0x01 .. 0xFF) {
                    my $mb  = pack('CCCC', $c1, $c2, $c3, $c4);
                    my $ucs = decode($codeset, $mb);
                    my $u   = ord(substr($ucs, 0, 1));

                    # $c1 is never 0 in this scan, so U+0000 (or an empty
                    # result) is always a failed decode.
                    next if $u == 0xFFFD || $u == 0;

                    print_mb2ucs_one_char($mb, $ucs);
                }
            }
        }
    }
}
 
# Print the complete table for $codeset by running the one-, two- and
# three-byte scans in that order.  The order matters: each scan consults
# the byte tables filled in by the shorter scans before it.
sub dump_mb2ucs {
    my ($codeset) = @_;

    my @scans_by_length = (
        \&dump_mb2ucs_1,
        \&dump_mb2ucs_2,
        \&dump_mb2ucs_3,
    );

    for my $scan (@scans_by_length) {
        $scan->($codeset);
    }
}

# Script entry point: expect exactly one argument, the name of the codeset
# to dump.  A wrong argument count or a name that find_encoding does not
# recognise is reported on STDERR and the script exits with status 1.
if (@ARGV != 1) {
    print STDERR "Usage: mb2ucs_perl codeset\n";
    exit 1;
}

my $codeset = $ARGV[0];

unless (defined find_encoding($codeset)) {
    print STDERR "Unknown encoding $codeset\n";
    exit 1;
}

dump_mb2ucs($codeset);

0;
