#!/usr/bin/env perl

use Encode;
use FileHandle;
use IPC::Open2;

# Longest character, in bytes, for each codeset this script knows about.
# dump_mb2ucs() uses it to decide whether the 2- and 3-byte passes run.
my %mb_max = (
    'cp932'    => 2,
    'cp51932'  => 2,
    'eucjp-ms' => 3,
    'sjis'     => 2,
    'euc'      => 3,
);

# Byte-sequence shapes accepted as one character.  These are kept as plain
# strings (not qr//) because the dump_* routines interpolate them between
# their own ^ and $ anchors.
#
# Shift_JIS family: one byte 00-7F or A1-DF, or a lead byte 81-9F / E0-FC
# followed by a trail byte 40-7E / 80-FC.
my $re_sjis = '(?:[\x00-\x7F\xA1-\xDF]'
            . '|[\x81-\x9F\xE0-\xFC][\x40-\x7E\x80-\xFC])';

# EUC family: one byte 00-7F, or 8E / A1-FE followed by A1-FE, or 8F followed
# by two bytes in A1-FE.
my $re_euc  = '(?:[\x00-\x7F]'
            . '|[\x8E\xA1-\xFE][\xA1-\xFE]'
            . '|\x8F[\xA1-\xFE][\xA1-\xFE])';

# Codeset name => byte-sequence pattern used to pre-filter candidates.
my %re_mb_code_range = (
    'cp932'    => $re_sjis,
    'cp51932'  => $re_euc,
    'eucjp-ms' => $re_euc,
    'sjis'     => $re_sjis,
    'euc'      => $re_euc,
);

# Tables filled in while dumping, indexed by byte value: an entry is set to 1
# once that byte has been seen at that position of a character that decoded
# to something.
my @singlebyte = ();
my @doublebyte_1st = ();
my @doublebyte_2nd = ();
# The triple-byte tables were previously used without any declaration; they
# are declared here so all position tables are file-scoped lexicals alike.
my @triplebyte_1st = ();
my @triplebyte_2nd = ();
my @triplebyte_3rd = ();

# Convert one candidate byte sequence to a Perl character string using nkf.
#
#   $codeset - input encoding name, handed to nkf as --ic=<codeset>
#   $mb      - raw bytes of the candidate character
#
# Returns the result of decoding nkf's UTF-8 output; the result is empty or
# undefined when nkf produced nothing for the input.
#
# Most sequences go through the long-running nkf filter on the NKFIN/NKFOUT
# handles (opened by the main program), one sequence per line.
sub nkf_decode {
    my ($codeset, $mb) = @_;
    my $utf8;

    if ($mb =~ /^[\x00-\x1F\x80-\xFF]$/s) {
        # A lone control byte or lone high byte is not sent through the
        # line-oriented filter; it gets a private nkf run whose whole output
        # is read after its input is closed.
        # NOTE(review): presumably because a newline byte cannot be framed as
        # a line and nkf may not answer immediately for such bytes - confirm.
        #
        # The byte is written straight to nkf's stdin.  The earlier
        # `printf "\xHH" | nkf ...` went through /bin/sh, where \x escapes
        # are not portable and $codeset was exposed to shell parsing.
        my $pid = open2(my $from_nkf, my $to_nkf,
                        'nkf', '-ux', "--ic=$codeset", '--oc=utf-8');
        binmode($to_nkf);
        binmode($from_nkf);
        print {$to_nkf} $mb;
        close($to_nkf);                 # EOF lets nkf flush and exit
        $utf8 = do { local $/; <$from_nkf> };
        close($from_nkf);
        waitpid($pid, 0);               # reap the one-shot child
    } else {
        print NKFIN $mb . "\n";
        $utf8 = <NKFOUT>;
        chomp($utf8) if defined $utf8;  # undef means the filter hit EOF
    }
    return Encode::decode("utf-8", $utf8);
}

# True when byte value $c1 has an entry in the @singlebyte table.
sub is_singlebyte {
    my $byte = shift;
    return defined $singlebyte[$byte];
}

# True when $c1 has an entry in @doublebyte_1st and $c2 has an entry in
# @doublebyte_2nd.  The two tables are consulted independently.
sub is_doublebyte {
    my ($lead, $trail) = @_;

    my $lead_seen = defined($doublebyte_1st[$lead]);
    return $lead_seen unless $lead_seen;

    return defined($doublebyte_2nd[$trail]);
}

# True when $c1, $c2 and $c3 each have an entry in the corresponding
# @triplebyte_1st / @triplebyte_2nd / @triplebyte_3rd table.
sub is_triplebyte {
    my ($lead, $middle, $last) = @_;

    my $seen = defined($triplebyte_1st[$lead]);
    $seen = defined($triplebyte_2nd[$middle]) if $seen;
    $seen = defined($triplebyte_3rd[$last])   if $seen;

    return $seen;
}

# Print one mapping line to STDOUT: every byte of $mb as \xHH, a space, then
# every character of $ucs as <UXXXX>, then a newline.
#
# An empty $mb is printed as a single \x00 byte mapped to the first character
# of $ucs (<U0000> when $ucs is empty too).
sub print_mb2ucs_one_char {
    my ($mb, $ucs) = @_;

    my $byte_count = length($mb);
    my $char_count = length($ucs);
    ($byte_count, $char_count) = (1, 1) if $byte_count == 0;

    my $mb_field = join '',
        map { sprintf "\\x%02X", ord(substr($mb, $_, 1)) } 0 .. $byte_count - 1;
    my $ucs_field = join '',
        map { sprintf "<U%04X>", ord(substr($ucs, $_, 1)) } 0 .. $char_count - 1;

    print $mb_field . " " . $ucs_field . "\n";
}

# Dump every one-byte character of $codeset.
#
# Each byte value 0x00-0xFF that has the shape of a complete character in
# %re_mb_code_range is decoded with nkf_decode(); when that yields at least
# one character the mapping line is printed and the byte is recorded in
# @singlebyte.
sub dump_mb2ucs_1 {
    my $codeset = shift;

    # Compiled once per call.  The former inline match used /o, which froze
    # the pattern of the first codeset ever passed in for the rest of the run.
    my $valid_shape = qr/^$re_mb_code_range{$codeset}$/;

    for my $c1 (0x00 .. 0xFF) {
        my $mb = pack('C', $c1);
        next if $mb !~ $valid_shape;

        my $ucs = nkf_decode($codeset, $mb);
        next unless defined $ucs && length($ucs) > 0;

        print_mb2ucs_one_char($mb, $ucs);
        $singlebyte[$c1] = 1;
    }
}

# Dump every two-byte character of $codeset.
#
# Lead bytes 0x80-0xFF already recorded as single-byte characters are
# skipped.  Each remaining lead byte is paired with trail bytes 0x20-0xFF;
# pairs with the shape of a complete character in %re_mb_code_range are
# decoded with nkf_decode().  When that yields at least one character the
# mapping line is printed and both bytes are recorded in @doublebyte_1st /
# @doublebyte_2nd.
sub dump_mb2ucs_2 {
    my $codeset = shift;

    # Compiled once per call.  The former inline match used /o, which froze
    # the pattern of the first codeset ever passed in for the rest of the run.
    my $valid_shape = qr/^$re_mb_code_range{$codeset}$/;

    for my $c1 (0x80 .. 0xFF) {
        next if is_singlebyte($c1);

        for my $c2 (0x20 .. 0xFF) {
            my $mb = pack('CC', $c1, $c2);
            next if $mb !~ $valid_shape;

            my $ucs = nkf_decode($codeset, $mb);
            next unless defined $ucs && length($ucs) > 0;

            print_mb2ucs_one_char($mb, $ucs);
            $doublebyte_1st[$c1] = 1;
            $doublebyte_2nd[$c2] = 1;
        }
    }
}

# Dump every three-byte character of $codeset.
#
# Lead bytes 0x80-0xFF already recorded as single-byte characters are
# skipped, as are (first, second) pairs that is_doublebyte() reports as
# seen.  Each remaining prefix is completed with third bytes 0x20-0xFF;
# triples with the shape of a complete character in %re_mb_code_range are
# decoded with nkf_decode().  When that yields at least one character the
# mapping line is printed and the three bytes are recorded in
# @triplebyte_1st / @triplebyte_2nd / @triplebyte_3rd.
sub dump_mb2ucs_3 {
    my $codeset = shift;

    # Compiled once per call.  The former inline match used /o, which froze
    # the pattern of the first codeset ever passed in for the rest of the run.
    my $valid_shape = qr/^$re_mb_code_range{$codeset}$/;

    for my $c1 (0x80 .. 0xFF) {
        next if is_singlebyte($c1);

        for my $c2 (0x20 .. 0xFF) {
            next if is_doublebyte($c1, $c2);

            for my $c3 (0x20 .. 0xFF) {
                my $mb = pack('CCC', $c1, $c2, $c3);
                next if $mb !~ $valid_shape;

                my $ucs = nkf_decode($codeset, $mb);
                next unless defined $ucs && length($ucs) > 0;

                print_mb2ucs_one_char($mb, $ucs);
                $triplebyte_1st[$c1] = 1;
                $triplebyte_2nd[$c2] = 1;
                $triplebyte_3rd[$c3] = 1;
            }
        }
    }
}
 
# Dump the whole table for $codeset: the one-byte pass always runs, and the
# two- and three-byte passes run only when %mb_max says the codeset has
# characters that long.
sub dump_mb2ucs {
    my ($codeset) = @_;

    dump_mb2ucs_1($codeset);
    return unless $mb_max{$codeset} >= 2;

    dump_mb2ucs_2($codeset);
    return unless $mb_max{$codeset} >= 3;

    dump_mb2ucs_3($codeset);
}

# Main program.
# Usage: mb2ucs codeset
# Starts one long-running nkf filter on NKFIN/NKFOUT for the given codeset
# and prints the mapping table to STDOUT.
my $codeset;

if (@ARGV != 1) {
    printf STDERR "Usage: mb2ucs codeset\n";
    exit 1;
}
$codeset = $ARGV[0];

# An unknown name has no byte pattern and no length entry, so it used to
# produce an empty table with a success status.  Report it instead.
if (!exists $mb_max{$codeset}) {
    printf STDERR "mb2ucs: unsupported codeset '%s' (expected one of: %s)\n",
                  $codeset, join(', ', sort keys %mb_max);
    exit 1;
}

my $pid = open2(\*NKFOUT, \*NKFIN, "nkf",
                                   "-ux",
                                   "--ic=${codeset}",
                                   "--oc=utf-8" );

dump_mb2ucs($codeset);

# Shut the filter down explicitly: closing its input delivers EOF, and
# waitpid reaps the child that open2 leaves to the caller.
close(NKFIN);
close(NKFOUT);
waitpid($pid, 0);

0;
