#!/usr/bin/ruby -Ke
# ssm-chasen-proxy.rb by Takuya Nishimoto 
# since 2008-08-26
# 
# - depends on mecab + ipadic
# -- tested with Ubuntu 8.04.1
# -- sudo apt-get install mecab mecab-ipadic
# - works as chasen-2.4.x-istc + unidic-chasen-1.3.9
# - called from gtalk
#

require 'rexml/parsers/streamparser'
require 'rexml/parsers/baseparser'
require 'rexml/streamlistener'
require 'nkf'

# Thin wrapper around a mecab child process.
#
# One process is started per instance and kept open; every #parse call
# writes to its stdin and reads the answer from its stdout.
class MeCab
  # Starts /usr/bin/mecab connected through a read/write pipe.
  def initialize
    @io = IO.popen("/usr/bin/mecab","r+")
  end

  # Sends +input+ to mecab and collects its reply up to the 'EOS' line.
  #
  # input:: text to analyse; it is written with puts.
  #
  # Returns an Array with one element per reply line, each element being
  # that line split on tab characters. The 'EOS' line is not included.
  def parse(input)
    @io.puts(input)
    word_list = []
    # gets returns nil once the pipe reaches end-of-file (for example when
    # the mecab process has gone away). Stop there and return what was read
    # instead of calling chomp on nil.
    while (line = @io.gets)
      line = line.chomp
      break if line == 'EOS'
      word_list << line.split("\t")
    end
    word_list
  end
end

# REXML stream listener that prints start and end tags back out and replaces
# character data with one <W1> element per word returned by MeCab#parse.
# Everything is written to standard output with puts.
class MyListener
  include REXML::StreamListener

  # Creates the MeCab instance used by every later #text call.
  def initialize
    super
    @mecab = MeCab.new
  end

  # Prints the start tag +name+ followed by its attributes as key="value"
  # pairs, after passing the whole tag text through NKF with '-e'.
  #
  # NOTE(review): attribute values are written without XML escaping, so a
  # value containing '"' or '&' produces malformed markup. Left unchanged
  # because the consumer's expectations are not visible here - confirm.
  def tag_start(name, attrs)
    x = name.dup
    attrs.each_pair do |k, v|
      x << %( #{k}="#{v}")
    end
    puts "<#{NKF.nkf('-e', x)}>"
  end

  # Prints the end tag for element name +s+.
  def tag_end s
    puts "</#{s}>"
  end

  # Converts the text +x+ with NKF '-e', sends it through mecab and prints
  # one <W1> element per returned word.
  #
  # For each word, kana and pron come from feature fields 7 and 8 (0-based,
  # comma separated) and fall back to the surface form when absent. The
  # *Base, lForm, lemma and form attributes repeat the surface/kana/pron
  # values; aType, aConType and goshu are fixed.
  def text x
    s = NKF.nkf('-e', x)
    @mecab.parse(s).each do |w|
      orth = w[0]
      # A reply line without a tab has no feature column; treat it as an
      # empty feature list instead of calling split on nil.
      a = (w[1] || '').split(',')
      kana = a[7] || orth
      pron = a[8] || orth
      pos = a[0]
      goshu = ''
      # NOTE(review): the literals below look like mis-decoded EUC-JP
      # part-of-speech names - verify them against the file in its original
      # encoding before relying on this mapping.
      case a[0]
      when '̾' then pos = '̾-̾-'
      when 'ư' then pos = 'ư-'
      when '' then pos = 'ư-ʽ'
      end
      puts <<EOS
<W1 orth="#{orth}" kana="#{kana}" pron="#{pron}" pos="#{pos}" orthBase="#{orth}"
 kanaBase="#{kana}"  pronBase="#{pron}" lForm="#{kana}" lemma="#{orth}" form="#{kana}"
 aType="1" aConType="C1" goshu="#{goshu}">#{orth}</W1>
EOS
    end
  end
end

# One listener is created up front and reused for every input line.
listener = MyListener.new

# Each line of standard input is handled as one XML fragment: it is converted
# with NKF '-w', fed to the stream parser, and the listener's output is
# wrapped in <S>...</S>.
while (line = STDIN.gets)
  puts "<S>"
  xml = NKF.nkf('-w', line.chomp)
  REXML::Parsers::StreamParser.new(xml, listener).parse
  puts "</S>"
  # Push the finished sentence out before waiting for the next line.
  STDOUT.flush
end









