(**************************************************************************)
(*  Mana : A kana(romaji)-kanji conversion engine using ChaSen algorithm.    *)
(*  Copyright (C) 2003, 2004  Yamagata Yoriyuki                           *)
(*                                                                        *)
(*  This program is free software; you can redistribute it and/or modify  *)
(*  it under the terms of the GNU General Public License as published by  *)
(*  the Free Software Foundation; either version 2 of the License, or (at  *)
(*  your option) any later version.                                       *)
(*                                                                        *)
(*  This program is distributed in the hope that it will be useful, but   *)
(*  WITHOUT ANY WARRANTY; without even the implied warranty of            *)
(*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU     *)
(*  General Public License for more details.                              *)
(**************************************************************************)

(* Copyright 2004 Yamagata Yoriyuki *)

let rec trans state  = function
    [] -> []
  | mrph :: r ->
      let new_state, con_cost = Chasen.connect_cost state mrph in
	if con_cost < 0 then 
	  trans state r 
	else
	  let undef = Chasen.is_undef mrph in
	  let mrph_cost =
	    if undef > 0 then
	      let undef_info = Chasen.undef_info mrph in
		Chasen.undef_info_cost undef_info +
		Chasen.undef_info_cost_step undef_info *
		(Chasen.keyword_len mrph / 2)
	    else
	      let hinsi = Chasen.hinsi mrph in
		Chasen.hinsi_cost hinsi	
	  in
	  let mrph_cost = mrph_cost * 
			  (Chasen.weight mrph) * 
			  Chasen.mrph_cost_weight 
	  in
	  let cost = mrph_cost + con_cost  in
	    (cost, new_state, mrph) :: trans state r

let prefix_mrph_list undef_cache string pos =
  let len = String.length string - pos in
  let mrph_list = Dict.lookup string pos len in
  let m = ref mrph_list in
    for i = 1 to len do
      let undef_list =
	try Hashtbl.find undef_cache (pos, i) with
	    Not_found ->
	      let s = String.sub string pos i  in
	      let list = Chasen.undefwords s in
		Hashtbl.add undef_cache (pos, i) list;
		list
      in
	m := List.rev_append undef_list !m
    done;
    !m 

let is_prefix_string s1 s2 i0 =
  if String.length s1 > String.length s2 then false else
    let rec loop i =
      if i >= String.length s1 then true
      else if s1.[i] = s2.[i0 + i] then loop (i + 1)      else false
    in
      loop 0

let match_hira hira mrph =
  let r = Chasen.keyword mrph in
  let h = Chasen.hiragana r in
    is_prefix_string h hira 0

let prefix_mrph_list_hira undef_cache string pos =
  let len = String.length string - pos in
  let hira = Chasen.hiragana (String.sub string pos len) in
  let mrph_list = Dict.lookup string pos len in
  let m = ref mrph_list in
    for i = 1 to len do
      let undef_list =
	try Hashtbl.find undef_cache (pos, i) with
	    Not_found ->
	      let s = String.sub string pos i  in
	      let list = Chasen.undefwords s in
		Hashtbl.add undef_cache (pos, i) list;
		list
      in
	m := List.rev_append undef_list !m
    done;
    List.filter (match_hira hira) !m

type parse_tree_aux =
    EmptyString
  | Nil
  | Branch of 
      (int ref * int ref * (Chasen.mrph * int) * parse_tree) *
      parse_tree
and parse_tree = parse_tree_aux Lazy.t

let rec b_cost_lb alpha (ra, rb, (mrph, c), pt) =
  if !ra >= alpha then 
    !ra
  else
    let a = c + t_cost_lb (alpha - c) pt in
      ra := a;
      if a < alpha then rb := a;
      a
and t_cost_lb alpha pt = 
  if alpha < 0 then 0 else
    match Lazy.force pt with
	EmptyString -> 1
      | Nil -> alpha
      | Branch (b, _) -> b_cost_lb alpha b

let compare_bcost (ra1, _, _, _) (ra2, _, _, _) = !ra1 - !ra2

exception Fail

let rec b_cost_ub (ra, rb, (mrph, c), pt) =
  if !rb <> 0 then 
    !rb
  else
    let b = c + t_cost_ub pt in
      rb := b;
      b
and t_cost_ub pt =
  match Lazy.force pt with
      EmptyString -> 1
    | Nil -> raise Fail
    | Branch (b, _) -> b_cost_ub b

let rec parse cache undef_cache state string pos =
  if String.length string <= pos then
    EmptyString
  else try
    Hashtbl.find cache (state, pos)
  with Not_found ->
    let mrph_list = prefix_mrph_list undef_cache string pos in
    let trans_list = trans state mrph_list in
    let expand (cost, state, mrph) =
      let pos = pos + Chasen.keyword_len mrph in
	(ref cost, ref 0, 
	 (mrph, cost),
	 lazy (parse cache undef_cache state string pos))
    in
    let b_list = List.map expand trans_list in
    let b_list = List.sort compare_bcost b_list in
    let rec branch0 = function
	[] -> Nil
      | b :: r -> branch b [] r
    and branch b b_list1 = function
	  [] -> 
	    let b_list1 = List.rev b_list1 in
	    Branch (b, lazy (branch0 b_list1))
	| b' :: r ->
	    try
	      let a = b_cost_ub b in
		if b_cost_lb a b' < a then
		  branch b' (b_list1 @ [b]) r
		else
		  branch b (b' :: b_list1) r
	    with Fail ->
	      branch b' b_list1 r
    in
    let pt = branch0 b_list in
      Hashtbl.add cache (state, pos) pt;
      pt

let hiragana_len mrph = String.length (Chasen.keyword mrph)

let rec parse_hira cache undef_cache state string pos =
  if String.length string <= pos then
    EmptyString
  else try
    Hashtbl.find cache (state, pos)
  with Not_found ->
    let mrph_list = prefix_mrph_list_hira undef_cache string pos in
    let trans_list = trans state mrph_list in
    let expand (cost, state, mrph) =
      let pos = pos + hiragana_len mrph in
	(ref cost, ref 0, 
	 (mrph, cost),
	 lazy (parse cache undef_cache state string pos))
    in
    let b_list = List.map expand trans_list in
    let b_list = List.sort compare_bcost b_list in
    let rec branch0 = function
	[] -> Nil
      | b :: r -> branch b [] r
    and branch b b_list1 = function
	  [] -> 
	    let b_list1 = List.rev b_list1 in
	    Branch (b, lazy (branch0 b_list1))
	| b' :: r ->
	    try
	      let a = b_cost_ub b in
		if b_cost_lb a b' < a then
		  branch b' (b_list1 @ [b]) r
		else
		  branch b (b' :: b_list1) r
	    with Fail ->
	      branch b' b_list1 r
    in
    let pt = branch0 b_list in
      Hashtbl.add cache (state, pos) pt;
      pt

let f string =
  let cache = Hashtbl.create 0 in
  let undef_cache = Hashtbl.create 0 in
    lazy (parse cache undef_cache 0 string 0)

let hiragana string =
  let cache = Hashtbl.create 0 in
  let undef_cache = Hashtbl.create 0 in
    lazy (parse_hira cache undef_cache 0 string 0)

let rec mix_branch b1 pt =
  match Lazy.force pt with
      Nil -> Branch (b1, lazy Nil) 
    | EmptyString -> assert false
    | Branch (b2, r) as pt2 ->
	if b1 == b2 then Branch (b1, r) else
	  try
	    let a = b_cost_ub b1 in
	      if a > b_cost_lb a b2 then
		Branch (b2, lazy (mix_branch b1 r))
	      else
		Branch (b1, lazy pt2)
	  with Fail ->
	    pt2
  
let rec split_best pt =
  match Lazy.force pt with
      EmptyString -> ([], lazy Nil)
    | Nil -> raise Not_found
    | Branch ((_, _, (mrph, c), r1), r2) ->
	try
	  let path, pt = split_best r1 in
	    match Lazy.force pt with
		Nil -> mrph :: path, r2
	      | pt1 ->
		  let b1 = (ref c, ref 0, (mrph, c), lazy pt1) in
		    (mrph :: path, lazy (mix_branch b1 r2))
	with Not_found ->
	  split_best r2

let p_cost = t_cost_ub

let rec mix_tree pt1 pt2 =
  lazy (match Lazy.force pt1, Lazy.force pt2 with
      Nil, t -> t
    | t, Nil -> t
    | EmptyString, EmptyString -> EmptyString
    | EmptyString, _ | _, EmptyString -> assert false
    | Branch (b1, r1), Branch (b2, r2) ->
	if b1 == b2 then
	  Branch (b1, mix_tree r1 r2)
	else
	  let b1, r1, b2, r2 =
	    let a = t_cost_ub pt1 in
	      if a <= t_cost_lb a pt2 then
		b1, r1, b2, r2
	      else
		b2, r2, b1, r1
	  in
	    Branch (b1, lazy (mix_branch b2 (mix_tree r1 r2))))

let rec match_head p pt =
  lazy (match Lazy.force pt with
      EmptyString| Nil -> Nil
    | Branch ((_, _, (mrph, _), r1), r2) ->
	if p mrph then
	  Lazy.force (mix_tree r1 (match_head p r2))
	else
	  Lazy.force (match_head p r2))

let list_candidates_aux p pt =
  let cans = Hashtbl.create 0 in
  let rec loop pt =
    match Lazy.force pt with
	EmptyString | Nil -> ()
      | Branch ((_, _, (mrph, _), r1) as b, r2) ->
	  (if p mrph then
	     let s = Conv.surface_form mrph in
	       try 
		 if b_cost_ub b < Hashtbl.find cans s then begin
		   Hashtbl.remove cans s;
		   Hashtbl.add cans s (b_cost_ub b)
		 end
	       with Not_found ->
		 Hashtbl.add cans s (b_cost_ub b));
	  loop r2
  in
    loop pt;
    let cans = Hashtbl.fold (fun key c cans -> (key, c) :: cans) cans [] in
    let cans = List.sort (fun (_, c1) (_, c2) -> c1 - c2) cans in
      List.map fst cans

let list_candidates len =
  list_candidates_aux (fun mrph -> Chasen.keyword_len mrph = len)

let fix_firstmrph word pt =
  let p mrph =
    let d = Chasen.mrph_data_of_mrph mrph in
      word = Chasen.surface_form d
  in
  match_head p pt

let fix_firstmrph_len len pt =
  let p mrph = Chasen.keyword_len mrph = len in
    match_head p pt


let rec best_path pt =
  match Lazy.force pt with
      EmptyString -> []
    | Nil -> raise Not_found
    | Branch ((_, _, (mrph, _), r), _) ->
	mrph :: best_path r

let rec count_char_aux s pos c flag =
  if String.length s <= pos then c else
    let flag' = Char.code (s.[pos]) >= 0x80 in
    let c = c + if flag && flag' then 0 else 1 in
      count_char_aux s (pos + 1) c flag'

let count_char s = count_char_aux s 0 0 false

let fix_firstmrph_len_hira len pt =
  let p mrph = count_char (Chasen.keyword mrph) = len in
    match_head p pt

let list_candidates_hira hira_len =
  let p mrph =
    let keyword = Chasen.keyword mrph in
    let hira = Chasen.hiragana keyword in
      count_char hira = hira_len
  in
    list_candidates_aux p

(* Branch ((alpha, beta, mrph, lazy p1), lazy p2):
   alpha:コスト下限 (0なら下限なし)
   beta:上限 (0なら上限なし)
   p1 : mrph に続く文の構文木
   p2 : mrph から始まらない構文木 *)


(* 空文字列のコストは１*)
