#!/usr/bin/perl -CS

# KanjiVG XML to IDSgrep EIDS translator
# Copyright (C) 2012  Matthew Skala
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, version 3.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
#
# Matthew Skala
# http://ansuz.sooke.bc.ca/
# mskala@ansuz.sooke.bc.ca

use XML::Parser;
use utf8;

$svnid='$Id: kvg2eids 1050 2012-01-03 02:52:22Z mskala $';

$parser=new XML::Parser(Style=>'Tree');

$parser->setHandlers(Comment=>\&dict_comment);

sub dict_comment {
  my($expat,$data)=@_;
  print "〖$data〗;\n";
}

print "〖EIDS kanji decomposition dictionary generated by kvg2eids\n"
     ."$svnid\n"
     ."Copyright notice below is from the input XML file.〗;\n";

$/=undef;
$_=<>;
$db=$parser->parse($_);

($db->[0]=='kanjivg') || die "unknown document tag";

for ($i=1;$i<=$#{$db->[1]};$i+=2) {
  next if $db->[1]->[$i] eq 0;
  if ($db->[1]->[$i] ne 'kanji') {
    print '〖unknown top-level tag: '.$db->[1]->[$i]."〗;\n";
    next;
  }
  if ($db->[1]->[$i+1]->[0]->{'id'}=~/kvg:kanji_([0-9a-f]+)/) {
    $topcode=$1;
  } else {
    $topcode=0;
  }
  for ($j=1;$j<=$#{$db->[1]->[$i+1]};$j+=2) {
    next if $db->[1]->[$i+1]->[$j] eq 0;
    if ($db->[1]->[$i+1]->[$j] ne 'g') {
      print '〖unknown tag inside kanji: '.$db->[1]->[$i+1]->[$j]."〗;\n";
      next;
    }
    print &make_eids($db->[1]->[$i+1]->[$j+1],1,$topcode)."\n";
  }
}

sub make_eids {
  my($g,$istop,$topcode)=@_;
  my($rval,%pos,$i);
  my($pass_istop)=$istop;

  if (($#$g==2) && ($g->[1] eq 'g')) {
    return &make_eids($g->[2],$pass_istop);
  }

  if (defined $g->[0]->{'kvg:element'}) {
    if ($istop) {
      $rval='【'.$g->[0]->{'kvg:element'}.'】';
      $pass_istop=0;
    } else {
      $rval='<'.$g->[0]->{'kvg:element'}.'>';
    }
  } elsif ($istop && $topcode) {
      $rval='【'.chr(hex("0x$topcode")).'】';
      $pass_istop=0;
  } else {
    $rval='';
  }
  
  for ($i=1;$i<=$#$g;$i+=2) {
    if ($g->[$i] eq 0) {
      $pos{'text'}=$g->[$i+1] if ($g->[$i+1])=~/\S/;
    } elsif (($g->[$i] eq 'g')
          && (defined $g->[$i+1]->[0]->{'kvg:position'})) {
      $pos{$g->[$i+1]->[0]->{'kvg:position'}}=$g->[$i+1];
    } else {
      $pos{'other'}=$g->[$i+1];
    }
  }
  
  my($splitout,$splitop)=('','');
  if (defined $pos{'nyo'}) {
    $splitout='nyo';
    $splitop='⿺';
  } elsif (defined $pos{'tare'}) {
    $splitout='tare';
    $splitop='⿸';
  }
  
  if ($splitout ne '') {
    my($lc,$rc)=([{}],[{}]);
    for ($i=1;$i<=$#$g;$i+=2) {
      if (($g->[$i] eq 'g')
       && ($g->[$i+1]->[0]->{'kvg:position'} eq $splitout)) {
        push @$lc,$g->[$i];
        push @$lc,$g->[$i+1];
        delete $lc->[$#$lc]->[0]->{'kvg:position'};
      } elsif ($g->[$i] ne 0) {
        push @$rc,$g->[$i];
        push @$rc,$g->[$i+1];
      }
    }
    $rval.=($splitop.&make_eids($lc,$pass_istop)
                    .&make_eids($rc,$pass_istop));
    return $rval;
  }
  
  my($plist)=join(' ',sort keys %pos);
  
  if ($plist eq 'bottom top') {
    $rval.=('⿱'.&make_eids($pos{'top'},$pass_istop)
           .&make_eids($pos{'bottom'},$pass_istop));
    return $rval;
  } elsif ($plist eq 'left right') {
    $rval.=('⿰'.&make_eids($pos{'left'},$pass_istop)
           .&make_eids($pos{'right'},$pass_istop));
    return $rval;
  } elsif (($plist ne '') && ($plist ne 'other')){
    $rval.="($plist)";
  } elsif ($rval eq '') {
    $rval='?';
  } elsif ((defined $g->[0]->{'kvg:element'}) && !$istop) {
    $rval=$g->[0]->{'kvg:element'};
  } else {
    $rval.=';';
  }

  $rval;
}
