#!/usr/bin/env php
<?php

include 'ucs4lib.php';

# Byte-indexed flag tables (one slot per byte value 0x00-0xFF), all starting
# at 0.  dump_mb2ucs_1() sets $singlebyte[...] to 1 and dump_mb2ucs_2() sets
# $doublebyte_1st[...] / $doublebyte_2nd[...] to 1 as mappings are found.
# PHP arrays are copied by value, so the three tables stay independent.
$singlebyte = $doublebyte_1st = $doublebyte_2nd = array_fill(0, 0x100, 0);

/**
 * Look up the single-byte flag recorded for a byte value.
 *
 * @param int $c Byte value used as the index into the global $singlebyte table.
 * @return mixed The stored table entry: 0 initially, 1 once dump_mb2ucs_1()
 *               has emitted a mapping for that byte.
 */
function is_singlebyte($c)
{
    return $GLOBALS['singlebyte'][$c];
}

/**
 * Tell whether a byte pair looks like an already-seen double-byte sequence.
 *
 * The two bytes are checked independently: the result is true when $c1 has
 * been flagged as a lead byte AND $c2 has been flagged as a trail byte, even
 * if that exact pair was never emitted together.
 *
 * @param int $c1 Candidate lead byte (index into $doublebyte_1st).
 * @param int $c2 Candidate trail byte (index into $doublebyte_2nd).
 * @return bool True when both flags are set.
 */
function is_doublebyte($c1, $c2)
{
    global $doublebyte_1st, $doublebyte_2nd;

    // Lead byte never seen: the trail table is not consulted at all.
    if (!$doublebyte_1st[$c1]) {
        return false;
    }

    return (bool) $doublebyte_2nd[$c2];
}

/**
 * Print one mapping line of the form "\xHH[\xHH...] <Uxxxx>[<Uxxxx>...]".
 *
 * Nothing is printed when ucs4ord() yields no code points.  When $mb is the
 * empty string, exactly one byte escape and one code point are printed.
 *
 * @param string $mb      Raw multibyte sequence.
 * @param string $ucs_bin Converted text as handed to ucs4ord().
 * @return void
 */
function print_mb2ucs_one_char($mb, $ucs_bin)
{
    $codepoints = ucs4ord($ucs_bin);
    $n_codepoints = count($codepoints);
    if ($n_codepoints == 0) {
        return;
    }

    $n_bytes = strlen($mb);
    if ($n_bytes == 0) {
        // Empty input still produces a line: one byte column, one code point.
        $n_bytes = 1;
        $n_codepoints = 1;
    }

    // Left column: every source byte as an upper-case hex escape.
    foreach (range(0, $n_bytes - 1) as $offset) {
        echo sprintf("\\x%02X", ord(substr($mb, $offset, 1)));
    }
    echo " ";

    // Right column: code points, read from index 1 upward.
    // NOTE(review): presumably ucs4ord() returns a 1-based array (as
    // unpack() does) -- confirm against ucs4lib.php.
    foreach (range(1, $n_codepoints) as $slot) {
        echo sprintf("<U%04X>", $codepoints[$slot]);
    }
    echo "\n";
}

/**
 * Pass 1: probe every single byte 0x00-0xFF in the given codeset.
 *
 * Each byte that converts to a non-empty UCS-4 string is printed and flagged
 * in the global $singlebyte table.  A result of U+003F is only accepted for
 * the input byte 0x3F itself.
 *
 * @param string $codeset Source encoding name passed to mb_convert_encoding().
 * @return void
 */
function dump_mb2ucs_1($codeset)
{
    global $singlebyte;

    foreach (range(0x00, 0xFF) as $byte) {
        $mb = chr($byte);
        $ucs = mb_convert_encoding($mb, "UCS-4", $codeset);

        if (strlen($ucs) == 0) {
            continue;
        }
        // NOTE(review): a '?' result for a non-'?' input is presumably the
        // converter's substitution character -- confirm mbstring settings.
        if ($byte != 0x3F && ucs4ord_1char($ucs) == 0x003F) {
            continue;
        }

        print_mb2ucs_one_char($mb, $ucs);
        $singlebyte[$byte] = 1;
    }
}

/**
 * Pass 2: probe two-byte sequences in the given codeset.
 *
 * Lead bytes already flagged by is_singlebyte() are skipped.  Trail bytes run
 * over 0x21-0xFE, excluding 0x7F; bytes 0x00-0x20 and 0xFF are never tried.
 * Each pair that converts is printed, and its lead and trail bytes are
 * flagged in the global $doublebyte_1st / $doublebyte_2nd tables.
 *
 * @param string $codeset Source encoding name passed to mb_convert_encoding().
 * @return void
 */
function dump_mb2ucs_2($codeset)
{
    global $doublebyte_1st, $doublebyte_2nd;

    for ($lead = 0x00; $lead <= 0xFF; $lead++) {
        if (is_singlebyte($lead)) {
            continue;
        }

        for ($trail = 0x21; $trail <= 0xFE; $trail++) {
            if ($trail == 0x7F) {
                continue;
            }

            $mb = chr($lead) . chr($trail);
            $ucs = mb_convert_encoding($mb, "UCS-4", $codeset);

            if (strlen($ucs) == 0) {
                continue;
            }
            // A U+003F result is only accepted when the lead byte is 0x3F.
            if ($lead != 0x3F && ucs4ord_1char($ucs) == 0x003F) {
                continue;
            }

            print_mb2ucs_one_char($mb, $ucs);
            $doublebyte_1st[$lead] = 1;
            $doublebyte_2nd[$trail] = 1;
        }
    }
}

/**
 * Pass 3: probe three-byte sequences in the given codeset.
 *
 * Only the lead byte 0x8F is tried.  Second and third bytes run over
 * 0x21-0xFE, excluding 0x7F; second bytes for which is_doublebyte() reports
 * the (lead, second) pair as already seen are skipped.  Each sequence that
 * converts to something other than U+003F is printed.
 *
 * @param string $codeset Source encoding name passed to mb_convert_encoding().
 * @return void
 */
function dump_mb2ucs_3($codeset)
{
    // NOTE(review): 0x8F is presumably the three-byte prefix of an EUC-style
    // codeset -- confirm before widening this to other lead bytes.
    $c1 = 0x8F;

    // There is no enclosing loop at this level, so leave with return;
    // a `continue` here is rejected by PHP as "not in the loop context".
    if (is_singlebyte($c1)) {
        return;
    }

    for ($c2 = 0x21; $c2 < 0x100; $c2++) {
        if ($c2 == 0x7F || $c2 == 0xFF) {
            continue;
        }
        if (is_doublebyte($c1, $c2)) {
            continue;
        }

        for ($c3 = 0x21; $c3 < 0x100; $c3++) {
            if ($c3 == 0x7F || $c3 == 0xFF) {
                continue;
            }

            $mb = chr($c1) . chr($c2) . chr($c3);
            $ucs = mb_convert_encoding($mb, "UCS-4", $codeset);

            // The lead byte is fixed at 0x8F, never '?' (0x3F), so any
            // U+003F result is rejected outright.
            if (strlen($ucs) > 0 && ucs4ord_1char($ucs) != 0x003F) {
                print_mb2ucs_one_char($mb, $ucs);
            }
        }
    }
}

/**
 * Run the three dump passes for a codeset, in order: one-byte, two-byte,
 * then three-byte sequences.  The order matters because the later passes
 * read the flag tables written by the earlier ones.
 *
 * @param string $codeset Source encoding name forwarded to every pass.
 * @return void
 */
function dump_mb2ucs($codeset)
{
    $passes = array('dump_mb2ucs_1', 'dump_mb2ucs_2', 'dump_mb2ucs_3');
    foreach ($passes as $pass) {
        $pass($codeset);
    }
}

# Command-line entry point: exactly one argument (the codeset name) is
# required.  On success the table is dumped and the script exits with 0;
# otherwise a usage line goes to STDERR and the exit status is 1.
if ($argc == 2) {
    dump_mb2ucs($argv[1]);
    exit(0);
}

fwrite(STDERR, "Usage: php_mb2ucs.php codeset\n");
exit(1);

?>
