(**************************************************************************)
(*  Mana : A kana(romaji)-kanji conversion engine using ChaSen algorithm.    *)
(*  Copyright (C) 2003, 2004  Yamagata Yoriyuki                           *)
(*                                                                        *)
(*  This program is free software; you can redistribute it and/or modify  *)
(*  it under the terms of the GNU General Public License as published by  *)
(*  the Free Software Foundation; either version 2 of the License, or (at  *)
(*  your option) any later version.                                       *)
(*                                                                        *)
(*  This program is distributed in the hope that it will be useful, but   *)
(*  WITHOUT ANY WARRANTY; without even the implied warranty of            *)
(*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU     *)
(*  General Public License for more details.                              *)
(**************************************************************************)

(* $Id: chasen.ml 76 2005-05-04 22:08:15Z yori $ *)

type darts
type mrph_prim
type mrph_data_prim
type undef_info
type hinsi
type ktype
type kform

external romaji : string -> string = "caml_romaji"
external katakana : string -> string = "caml_katakana"
external hiragana : string -> string = "caml_hiragana"
external katakana2hiragana : string -> string = "caml_katakana2hiragana"
external hiragana2katakana : string -> string = "caml_hiragana2katakana"
external chasen_init : unit -> unit = "chasen_init"
let _ = chasen_init ()

external darts_open : string -> string -> string -> darts = "darts_open"
external get_darts : int -> darts = "get_darts"
external ndicfile : unit -> int = "ndicfile"
let ndicfile = ndicfile ()
(* let _ = Printf.printf "ndicfile %d" ndicfile *)

external mrph_posid : mrph_prim -> int = "mrph_posid"
external mrph_inf_type : mrph_prim -> int = "mrph_inf_type"
external mrph_inf_form : mrph_prim -> int = "mrph_inf_form"
external mrph_weight : mrph_prim -> int = "mrph_weight"
external mrph_con_tbl : mrph_prim -> int = "mrph_dat_index"
external mrph_dat_index : mrph_prim -> int = "mrph_dat_index"
external mrph_keyword : mrph_prim -> string = "mrph_keyword"
external mrph_keyword_len : mrph_prim -> int = "mrph_keyword_len"
external mrph_headword : mrph_prim -> string = "mrph_headword"
external mrph_headword_len : mrph_prim -> int = "mrph_headword_len"
external mrph_is_undef : mrph_prim -> int = "mrph_is_undef"
external mrph_darts : mrph_prim -> darts = "mrph_darts"
external mrph_connect_cost : int -> mrph_prim -> (int * int) = "connect_cost"

external new_mrph : int -> int -> int -> int -> int -> int -> string -> string -> int -> darts ->
  mrph_prim = "new_mrph_byte" "new_mrph_opt"

let new_mrph ~posid ~inf_type ~inf_form ~weight ~con_tbl ~dat_index ~keyword ~headword ~is_undef ~darts =
  new_mrph posid inf_type inf_form weight con_tbl dat_index keyword headword is_undef darts
  
external delete_mrph : mrph_prim -> unit = "delete_mrph"
external mrph_default_weight : unit -> int = "mrph_default_weight"
let mrph_default_weight = mrph_default_weight ()
(* let _ = Printf.printf "mrph_default_weight %d" mrph_default_weight *)

external mrph_cost_weight : unit -> int = "mrph_cost_weight"
let mrph_cost_weight = mrph_cost_weight ()
(* let _ = Printf.printf "mrph_cost_weight %d" mrph_cost_weight *)

external mrph_data_posid : mrph_data_prim -> int = "mrph_data_posid"
external mrph_data_inf_type : mrph_data_prim -> int = "mrph_data_inf_type"
external mrph_data_inf_form : mrph_data_prim -> int = "mrph_data_inf_form"
external mrph_data_weight : mrph_data_prim -> int = "mrph_data_weight"
external mrph_data_con_tbl : mrph_data_prim -> int = "mrph_data_dat_index"
external mrph_data_dat_index : mrph_data_prim -> int = "mrph_data_dat_index"
external mrph_data_keyword : mrph_data_prim -> string = "mrph_data_keyword"
external mrph_data_keyword_len : mrph_data_prim -> int = "mrph_data_keyword_len"
external mrph_data_is_undef : mrph_data_prim -> int = "mrph_data_is_undef"
external mrph_data_darts : mrph_data_prim -> darts = "mrph_data_darts"
external mrph_data_headword : mrph_data_prim -> string = "mrph_data_headword"
external mrph_data_headword_len : mrph_data_prim -> int = "mrph_data_headword_len"
external mrph_data_reading : mrph_data_prim -> string = "mrph_data_headword"
external mrph_data_reading_len : mrph_data_prim -> int = "mrph_data_reading_len"
external mrph_data_pron : mrph_data_prim -> string = "mrph_data_pron"
external mrph_data_pron_len : mrph_data_prim -> int = "mrph_data_pron_len"
external mrph_data_stem_len : mrph_data_prim -> int = "mrph_data_stem_len"
external mrph_data_base : mrph_data_prim -> string = "mrph_data_base"
external mrph_data_info : mrph_data_prim -> string = "mrph_data_info"
external mrph_data_compound : mrph_data_prim -> int = "mrph_data_compound"

external delete_mrph_data : mrph_data_prim -> unit = "delete_mrph_data"
external mrph_data_of_mrph : mrph_prim -> mrph_data_prim = "mrph_data_of_mrph"

external darts_lookup : darts -> string -> int -> int -> mrph_prim array = "darts_lookup"
external darts_lookup_prefix : darts -> string -> int -> int -> (mrph_prim * int) array = "darts_lookup_prefix"

external undef_info_cost : undef_info -> int = "undef_info_cost"
external undef_info_cost_step : undef_info -> int = "undef_info_cost_step"
external undef_info_con_tbl : undef_info -> int = "undef_info_con_tbl"
external undef_info_hinsi : undef_info -> int = "undef_info_hinsi"
external get_undef_info : int -> undef_info = "get_undef_info"

external undef_hinsi_max : unit -> int = "undef_hinsi_max"
let undef_hinsi_max = undef_hinsi_max ()

external undef_info_num : unit -> int = "undef_info_num"
let undef_info_num = undef_info_num ()

external hinsi_composit : hinsi -> int = "hinsi_composit"
external hinsi_depth : hinsi -> int = "hinsi_depth"
external hinsi_kt : hinsi -> int = "hinsi_kt"
external hinsi_cost : hinsi -> int = "hinsi_cost"
external get_hinsi : int -> hinsi = "get_hinsi"

external ktype_basic : ktype -> int = "ktype_basic"
external get_ktype : int -> ktype = "get_ktype"

external kform_gobi : kform -> string = "kform_gobi"
external kform_gobi_len : kform -> int = "kform_gobi_len"
external kform_ygobi : kform -> string = "kform_ygobi"
external kform_pgobi : kform -> string = "kform_pgobi"
external get_kform : int -> int -> kform = "get_kform"

type mrph = [`Mrph of mrph_prim | `Mrph_data of mrph_data_prim]
type mrph_data = [`Mrph_data of mrph_data_prim]

let posid = function
    `Mrph mrph -> mrph_posid mrph
  | `Mrph_data data -> mrph_data_posid data

let inf_type = function
    `Mrph mrph -> mrph_inf_type mrph
  | `Mrph_data data -> mrph_data_inf_type data

let inf_form = function
    `Mrph mrph -> mrph_inf_form mrph
  | `Mrph_data data -> mrph_data_inf_form data

let weight = function
    `Mrph mrph -> mrph_weight mrph
  | `Mrph_data data -> mrph_data_weight data

let con_tbl = function
    `Mrph mrph -> mrph_weight mrph
  | `Mrph_data data -> mrph_data_weight data

let keyword = function
    `Mrph mrph -> mrph_keyword mrph
  | `Mrph_data data -> mrph_data_keyword data

let keyword_len = function
    `Mrph mrph -> mrph_keyword_len mrph
  | `Mrph_data data -> mrph_data_keyword_len data

let is_undef = function
    `Mrph mrph -> mrph_is_undef mrph
  | `Mrph_data data -> mrph_data_is_undef data

let undef_info mrph =
  get_undef_info (is_undef mrph)

let hinsi mrph =
  get_hinsi (posid mrph)

let darts = function
    `Mrph mrph -> mrph_darts mrph
  | `Mrph_data data -> mrph_data_darts data

let headword = function
    `Mrph mrph -> mrph_headword mrph
  | `Mrph_data data -> mrph_data_headword data

let headword_len = function
    `Mrph mrph -> mrph_headword_len mrph
  | `Mrph_data data -> mrph_data_headword_len data

let delete_mrph = function
    `Mrph m -> delete_mrph m
  | `Mrph_data m -> delete_mrph_data m

let make_mrph mp = 
  let mrph = `Mrph mp in
    Gc.finalise delete_mrph mrph;
    mrph

let create_undefword keyword headword weight no =
  let undef_info = get_undef_info no in
  let mrph_prim = 
    new_mrph 
      ~posid:(undef_info_hinsi undef_info)
      ~dat_index:~-1
      ~inf_type:0
      ~inf_form:0
      ~weight:weight
      ~con_tbl:(undef_info_con_tbl undef_info)
      ~keyword
      ~headword
      ~is_undef:(no + 1)
      ~darts:(Obj.magic 0) in
  make_mrph mrph_prim

let undefwords keyword =
  let rec loop no =
    if no >= undef_info_num then [] else
      let hira = create_undefword keyword keyword mrph_default_weight no in
      let kata = 
	create_undefword 
	  keyword 
	  (hiragana2katakana keyword) 
	  (mrph_default_weight*2)
	  no in
      let roma = 
	create_undefword 
	  keyword 
	  (romaji keyword) 
	  (mrph_default_weight*3)
	  no in
	hira :: kata :: roma :: loop (no + 1) in
    loop 0

let darts_lookup string pos len =
  let pos = EucString.nth string 0 pos in
  let len = EucString.nth string pos len - pos in
  let ret = ref [] in
  for i = 0 to ndicfile - 1 do
    let x = darts_lookup (get_darts i) string pos len in
    let x = Array.to_list x in
    let x = List.map make_mrph x in
    ret := List.rev_append x !ret
  done;
    !ret

let darts_lookup_prefix string pos len =
  let pos = EucString.nth string 0 pos in
  let len = EucString.nth string pos len - pos in
  let ret = ref [] in
  for i = 0 to ndicfile - 1 do
    let x = darts_lookup_prefix (get_darts i) string pos len in
    let x = Array.to_list x in
    let f (mrph, byte_len) =
      let len = EucString.count string pos (pos + byte_len) in
      (make_mrph mrph, len) in
    let x = List.map f x in
    ret := List.rev_append x !ret
  done;
    !ret

let connect_cost posid = function
    `Mrph mrph -> mrph_connect_cost posid mrph
  | `Mrph_data data ->
      mrph_connect_cost posid (Obj.magic data)
  
let delete_mrph_data = function `Mrph_data d -> delete_mrph_data d

let mrph_data_of_mrph = function 
    `Mrph mrph ->
      let d = mrph_data_of_mrph mrph in
      let md = `Mrph_data d in
	Gc.finalise delete_mrph_data md;
	md
  | (`Mrph_data _) as x -> x
    
let mrph_of_mrph_data ((`Mrph_data _) as x) = x

let reading = function `Mrph_data d -> mrph_data_reading d
let reading_len = function `Mrph_data d -> mrph_data_reading_len d
let pron = function `Mrph_data d -> mrph_data_pron d
let pron_len = function `Mrph_data d -> mrph_data_pron_len d
let stem_len = function `Mrph_data d -> mrph_data_stem_len d
let base = function `Mrph_data d -> mrph_data_base d
let info = function `Mrph_data d -> mrph_data_info d
let compound = function `Mrph_data d -> mrph_data_compound d

let surface_form d =
  let inf_type = inf_type d in
  let headword = headword d in
    if inf_type > 0 && stem_len d >= 0 then
      let kform = get_kform inf_type (inf_form d) in
      let gobi = kform_gobi kform in
	headword ^ gobi
    else
      headword
