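/*
 * Builds word_lists, the smallest units from which phrase segments are
 * composed.
 * (English reconstruction of a mis-encoded Japanese header; wording is
 * approximate.)
 *
 * init_word_seq_tab()
 *   initialises the pointers to nodes in the dependent-word table
 * release_word_seq_tab()
 *   releases the dependent-word table
 * anthy_make_word_list_all() 
 *   enumerates the substrings that have the form of a phrase segment
 *
 * Funded by IPA Exploratory Software Project 2002 2/27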
 * Copyright (C) 2000-2002 TABATA Yusuke, UGAWA Tomoharu
 *
 * $Id: wordlist.c,v 1.38 2002/06/16 10:56:32 yusuke Exp $
 *
 */

//#define DEBUG_CONJUGATE_TABLE

#include <stdlib.h>
#include <stdio.h>
#include <string.h>

#include <alloc.h>
#include <record.h>
#include <xstr.h>
#include <wtype.h>
#include <conf.h>
#include <ruleparser.h>
#include <dic.h>
#include <splitter.h>
#include "wordborder.h"

/* Allocator that parse_line() draws every struct wordseq_rule from.
 * init_word_seq_tab() creates it, passing wordseq_rule_dtor as the second
 * argument of anthy_create_allocator(). */
static allocator wordseq_rule_ator;

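/*
 * Properties wanted from the score table s[] / scoring function f().
 * (English reconstruction of a mis-encoded Japanese comment; wording is
 * approximate.)
 * + for indices x, y:
 *    x < y implies s[x] < s[y]
 * + for the average length A of an independent word:
 *    f(2*A - x) + f(x) takes its extremum at x = A
 *    f(a+b) > f(a)+f(b)
 *
 * The program below prints the running sums of the binomial coefficients
 * C(9,i), i.e. 1,10,46,130,256,382,466,502,511,512:
 * int f(int x){if (x) return x*f(x-1);return 1;}
 * int c(int x,int y){return f(x)/(f(y)*(f(x-y)));}
 * int main(){int i,j;for(i=0,j=0;i<10;i++){j+=c(9,i);printf("%d,",j);}}
 */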
/* Largest index that calc_score_by_len() reads from score_by_core_len[]
 * directly; for longer lengths it scales the entry at this index linearly. */
#define SCORE_BY_CORE_LEN_TAB_MAX 9
/* Score indexed by the length of the core word. Two alternative tables are
 * kept commented out below. The active table has 11 entries, so its last
 * entry (index 10) is never read by calc_score_by_len().
 * NOTE(review): confirm whether SCORE_BY_CORE_LEN_TAB_MAX was meant to be 10. */
static const int score_by_core_len[] =
/*{1,1,10,46,130,256,382,466,502,511,512};*/
/*{1,1,3,27,81,243,729,2187,6561,19683,59049};*/
  {1,1,8,27,64,125,216,343,512,729,1000};
/* One rule built by parse_line() from a line of the INDEPWORD file.
 * Rules are chained through `next`; the list head is gRules. */
struct wordseq_rule{
  wtype_t wt; /* filled in by anthy_name_to_wtype() from the name below */
  char *name; /* word type (part-of-speech) name; strdup'ed copy, freed by wordseq_rule_dtor() */
  int node_id; /* id that anthy_get_node_id_by_name() returned for the rule's node name */
  struct wordseq_rule *next;
};

/* Word connection rules: head of the list that parse_line() prepends to */
static struct wordseq_rule *gRules;

/* Rule file parsing */
static void parse_line(char **, int nr);

/* Helpers that extend a word_list template and score it */
static void make_pre_words(struct splitter_context *, struct word_list *);
static void make_suc_words(struct splitter_context *, struct word_list *);
static void make_following_word_list(struct splitter_context *,
				     struct word_list *);
static int calc_suffix_affinity(seq_ent_t core, seq_ent_t suf,
				int len, int suc);
static int calc_score_by_len(int x);
/* NOTE(review): this file-local function has the same name as the C99
 * <math.h> function log2(); compilers may warn about the mismatch with the
 * built-in declaration. Renaming needs the definition and the caller changed
 * together. */
static int log2(int );
static void wordseq_rule_dtor(void *);

/* word_list */
static void anthy_make_expanded_word_list(struct splitter_context *sc,
					  struct word_list *wl);
static void make_word_list(struct splitter_context *c, seq_ent_t se,
			   int from, int len);
static void make_dummy_head(struct splitter_context *c);
static void setup_word_list(struct word_list *, int, int);

/*
 * Debug helper: dump one word_list through anthy_putxstr() and printf() as
 *   <prefix>.<core>.<postfix>-<dep> <word type name> <score> <conn_score>
 * A NULL wl prints "--" and nothing else. When the word_list has no
 * core_wt_name, "---" is printed in its place.
 */
void
anthy_print_word_list(struct splitter_context *sc,
		      struct word_list *wl)
{
  const char *type_name;
  xstr seg;
  int pos;

  if (!wl) {
    printf("--\n");
    return ;
  }

  /* prefix: everything between the start of the word_list and the core */
  seg.str = sc->ce[wl->from].c;
  seg.len = wl->core_from - wl->from;
  anthy_putxstr(&seg);
  printf(".");

  /* core word */
  pos = wl->core_from;
  seg.str = sc->ce[pos].c;
  seg.len = wl->core_len;
  anthy_putxstr(&seg);
  printf(".");

  /* postfix, directly after the core */
  pos += wl->core_len;
  seg.str = sc->ce[pos].c;
  seg.len = wl->postfix_len;
  anthy_putxstr(&seg);
  printf("-");

  /* dependent part, directly after the postfix */
  pos += wl->postfix_len;
  seg.str = sc->ce[pos].c;
  seg.len = wl->dep_len;
  anthy_putxstr(&seg);

  type_name = wl->core_wt_name ? wl->core_wt_name : "---";
  printf(" %s %d %d\n", type_name, wl->score, wl->conn_score);
}

/*
 * Destructor registered with wordseq_rule_ator: frees the name string that
 * parse_line() strdup'ed into the rule. The rule storage itself is not
 * freed here.
 */
static void
wordseq_rule_dtor(void *p)
{
  free(((struct wordseq_rule *)p)->name);
}

/*
 * Score for a core word of `len` characters.
 *
 * Lengths up to SCORE_BY_CORE_LEN_TAB_MAX are looked up in
 * score_by_core_len[]; longer lengths scale the entry at
 * SCORE_BY_CORE_LEN_TAB_MAX linearly with len. A negative len is treated
 * as 0 so that it can never index before the start of the table.
 */
static int
calc_score_by_len(int len)
{
  if (len < 0) {
    /* guard against an out-of-bounds read on a bogus length */
    len = 0;
  }
  if (len <= SCORE_BY_CORE_LEN_TAB_MAX) {
    return score_by_core_len[len];
  }
  /* linear increase */
  return
    score_by_core_len[SCORE_BY_CORE_LEN_TAB_MAX] * len /
    SCORE_BY_CORE_LEN_TAB_MAX;
}

/*
 * Coarse, logarithm-like bucket of x (not a true base-2 logarithm):
 *   x < 2 -> 1, x < 4 -> 2, x < 16 -> 3, x < 32 -> 4, x < 64 -> 5,
 * and every whole factor of 64 contained in x adds another 5.
 * Zero and negative values fall into the first bucket.
 *
 * NOTE(review): the name is the same as the C99 <math.h> function log2().
 */
static int
log2(int x)
{
  int acc = 0;

  /* strip factors of 64; each one is worth 5 */
  while (x >= 64) {
    acc += 5;
    x /= 64;
  }

  if (x < 2) {
    return acc + 1;
  }
  if (x < 4) {
    return acc + 2;
  }
  if (x < 16) {
    return acc + 3;
  }
  if (x < 32) {
    return acc + 4;
  }
  return acc + 5;
}

/*
 * Create extra word_lists whose dependent part is extended by entries
 * recorded in the "EXPAND_DEP" section.
 *
 * The current dependent part of wl (the last dep_len characters of the
 * word_list) is used as the column key. For every value stored under that
 * key which matches the text following wl, a copy of wl with dep_len and
 * len grown by the value's length is allocated and committed.
 * Returns silently when the section or the column cannot be selected.
 *
 * NOTE(review): anthy_commit_word_list() calls back into this function for
 * each new word_list, which runs anthy_select_section() and
 * anthy_select_column() again. If those calls change the selection that
 * anthy_get_nth_xstr() reads from (TODO confirm), the remaining iterations
 * of the loop below no longer read the column `nr` was taken from.
 */
static void
anthy_make_expanded_word_list(struct splitter_context *sc,
			      struct word_list *wl)
{
  int i, nr, rv;
  xstr dep, tail;

  /* look up extended dependent parts, using the dependent part as key */
  rv = anthy_select_section("EXPAND_DEP", 0);
  if (rv == -1) {
    return ;
  }

  dep.str = sc->ce[wl->from + wl->len - wl->dep_len].c;
  dep.len = wl->dep_len;

  rv = anthy_select_column(&dep, 0);
  if (rv == -1) {
    return ;
  }

  /* text that directly follows the word_list; its length is set per entry */
  tail.str = sc->ce[wl->from + wl->len].c;

  /* for each extension entry */
  nr = anthy_get_nr_values();
  for (i = 0; i < nr; i++) {
    xstr *xs;

    xs = anthy_get_nth_xstr(i);
    if (!xs) {
      continue;
    }

    tail.len = xs->len;
    /* skip entries that would run past the end of the input */
    if (tail.len + wl->from + wl->len > sc->char_count) {
      continue;
    }
    /* anthy_xstrcmp() returning 0 is taken as "the following text matches" */
    if (!anthy_xstrcmp(&tail, xs)) {
      struct word_list *new_one;
      new_one = anthy_alloc_word_list(sc);
      /* copy of wl as it is now; score and next are set when it is committed */
      *new_one = *wl;
      new_one->dep_len += xs->len;
      new_one->len += xs->len;

      anthy_commit_word_list(sc, new_one);
    }
  }
}

/*
 * Compute the score of a finished word_list and link it into the list of
 * word_lists that start at wl->from.
 *
 * Extended variants of wl are generated (and committed) first. Whatever
 * value wl->score held on entry is discarded; the score is rebuilt from
 * the core length, freq, dep_len, prefix/postfix lengths, conn_score and
 * the total length.
 */
void 
anthy_commit_word_list(struct splitter_context *sc,
		       struct word_list *wl)
{
  int freq_factor;
  int score;

  /* variants whose dependent part is extended are handled before scoring */
  anthy_make_expanded_word_list(sc, wl);

  /* frequency factor, capped at 20 */
  freq_factor = log2(wl->freq) + 10;
  if (freq_factor > 20) {
    freq_factor = 20;
  }

  /* core length weighted by the frequency factor */
  score = calc_score_by_len(wl->core_len) * freq_factor;

  /* bonus for the dependent part */
  if (wl->dep_len) {
    score += (100 * wl->dep_len);
  }

  /* penalty for prefix and postfix characters */
  score -= 10000 * (wl->postfix_len + wl->prefix_len);

  /* scale by the connection score; 256 leaves the score unchanged */
  score *= wl->conn_score;
  score /= 256;

  /* weight by the total length */
  wl->score = (SCORE_PER_LEN + score) * wl->len;

  /* push onto the list of word_lists starting at wl->from */
  wl->next = sc->word_split_info->cnode[wl->from].wl;
  sc->word_split_info->cnode[wl->from].wl = wl;

  /* for tracing: anthy_print_word_list(sc, wl); */
}

/*
 * Take one word_list object from the context's WlAllocator.
 * This function does not initialise the object itself.
 */
struct word_list *
anthy_alloc_word_list(struct splitter_context *sc)
{
  struct word_list *wl;

  wl = anthy_smalloc(sc->word_split_info->WlAllocator);
  return wl;
}

/*
 * Bonus for attaching the affix `suf` (len characters long) to the core
 * word `core`.
 *
 * 10000 * len is granted when the affix carries SF_NUM and the core carries
 * NF_NUM. For suffixes only (isSuc non-zero) another 10000 * len is granted
 * when the affix carries SF_JN and the core carries NF_NAME.
 * The flags of `core` are queried only after the affix flag matched.
 */
static int
calc_suffix_affinity(seq_ent_t core, seq_ent_t suf,
		     int len, int isSuc)
{
  int bonus = 0;

  if (anthy_get_seq_flag(suf) & SF_NUM) {
    if (anthy_get_seq_flag(core) & NF_NUM) {
      bonus += 10000 * len;
    }
  }

  /* the SF_JN / NF_NAME pairing applies to suffixes only */
  if (!isSuc) {
    return bonus;
  }

  if (anthy_get_seq_flag(suf) & SF_JN) {
    if (anthy_get_seq_flag(core) & NF_NAME) {
      bonus += 10000 * len;
    }
  }

  return bonus;
}

/* ³ν졢ưդ */
static void
make_following_word_list(struct splitter_context *sc,
			 struct word_list *tmpl)
{
  xstr xs;
  xs.str = sc->ce[tmpl->from+tmpl->len].c;
  xs.len = sc->char_count - tmpl->from - tmpl->len;
  if (tmpl->node_id == -1) {
    struct wordseq_rule *r;
    struct word_list new_tmpl;
    new_tmpl = *tmpl;
    for (r = gRules; r; r = r->next) {
      new_tmpl.core_wt = r->wt;
      new_tmpl.core_wt_name = r->name;
      new_tmpl.node_id = r->node_id;
      anthy_scan_node(sc, &new_tmpl, &xs, new_tmpl.node_id);
    }
  } else {
    anthy_scan_node(sc, tmpl, &xs, tmpl->node_id);
  }
}

/* 򤯤äĤ */
static void 
make_suc_words(struct splitter_context *sc,
	       struct word_list *tmpl)
{
  int i, right;

  right = tmpl->core_from + tmpl->core_len;
  for (i = 1;
       i <= sc->word_split_info->seq_len[right];
       i++){
    xstr xs;
    seq_ent_t s;
    xs.str = sc->ce[right].c;
    xs.len = i;
    s = anthy_get_seq_ent_from_xstr(&xs);
    if (anthy_get_seq_ent_pos(s, POS_SUC)) {
      struct word_list new_tmpl;
      new_tmpl = *tmpl;
      new_tmpl.len += i;
      new_tmpl.postfix_len += i;
      new_tmpl.postfix_wt = anthy_wtype_postfix;
      new_tmpl.score += calc_suffix_affinity(new_tmpl.core_seq, s, i, 1);
      make_following_word_list(sc, &new_tmpl);
    }
  }
}

/* Ƭ򤯤äĤƤ򤯤äĤ */
static void
make_pre_words(struct splitter_context *sc,
	       struct word_list *tmpl)
{
  int i;
  /* Ƭ󤹤 */
  for (i = 1; 
       i <= sc->word_split_info->rev_seq_len[tmpl->core_from]; i++) {
    xstr xs;
    seq_ent_t s;
    xs.str = sc->ce[tmpl->core_from-i].c;
    xs.len = i;
    s = anthy_get_seq_ent_from_xstr(&xs);
    if (anthy_get_seq_ent_pos(s, POS_PRE)) {
      struct word_list new_tmpl;
      new_tmpl = *tmpl;
      new_tmpl.from = tmpl->from - i;
      new_tmpl.len = tmpl->len + i;
      new_tmpl.score += calc_suffix_affinity(new_tmpl.core_seq, s, i, 0);
      new_tmpl.prefix_len += i;
      new_tmpl.prefix_wt = anthy_wtype_prefix;
      make_following_word_list(sc, &new_tmpl);
      if (anthy_get_seq_flag(tmpl->core_seq) & NF_NUM) {
	/* ξ⤯äĤ */
	make_suc_words(sc, &new_tmpl);
      }
    }
  }
}

/*
 * Reset wl to a bare template whose core is the `len` characters starting
 * at `from`: no prefix, postfix or dependent part, no word type, no node
 * (node_id -1), score 0, conn_score 256 and freq 1.
 */
static void
setup_word_list(struct word_list *wl, int from, int len)
{
  /* extent: the whole word_list is the core */
  wl->from = from;
  wl->core_from = from;
  wl->len = len;
  wl->core_len = len;

  /* nothing attached yet */
  wl->prefix_len = 0;
  wl->postfix_len = 0;
  wl->dep_len = 0;
  wl->prefix_wt = anthy_wt_none;
  wl->postfix_wt = anthy_wt_none;

  /* core word not identified yet */
  wl->core_seq = 0;
  wl->core_wt = anthy_wt_none;
  wl->core_wt_name = NULL;
  wl->node_id = -1;

  /* scoring defaults */
  wl->score = 0;
  wl->conn_score = 256;
  wl->freq = 1; /* start out as a low-frequency word */
}

/*
 * Build word_list candidates around one core word.
 *
 * se is the seq_ent of the core word that covers `len` characters starting
 * at `from`. For every rule in gRules for which
 * anthy_get_seq_ent_wtype_freq() reports a non-zero frequency, the template
 * is given that rule's word type, name, node and frequency. It is then
 * extended with prefixes and suffixes where allowed, and in every case passed
 * to make_following_word_list() without any affix.
 */
static void
make_word_list(struct splitter_context *sc,
	       seq_ent_t se,
	       int from, int len)
{
  struct word_list tmpl;
  struct wordseq_rule *r;

  /* initialise the template */
  setup_word_list(&tmpl, from, len);
  tmpl.core_seq = se;

  for (r = gRules; r; r = r->next) {
    int freq = anthy_get_seq_ent_wtype_freq(se, r->wt);
    if (freq) {
      /* non-zero frequency: the core word is used with this rule's word type */
#ifdef DEBUG_CONJUGATE_TABLE
      xstr xs;
      xs.str = sc->ce[tmpl.core_from].c;
      xs.len = tmpl.core_len;
      anthy_putxstr(&xs);
      printf(" %s %d\n", r->name, freq);
#endif
      tmpl.core_wt = r->wt;
      tmpl.freq = freq;
      tmpl.core_wt_name = r->name;
      tmpl.node_id = r->node_id;
      if (anthy_wtype_get_pos(r->wt) == POS_NOUN) {
	/* Affixes are attached to nouns only; prefixes additionally require
	 * the flag tested below.
	 * NOTE(review): this tests SF_NUM on the core word, whereas
	 * make_pre_words() and calc_suffix_affinity() test NF_NUM on the core
	 * and SF_NUM on the affix. Confirm SF_NUM is intended here. */
	if (anthy_get_seq_flag(se) & SF_NUM) {
	  make_pre_words(sc, &tmpl);
	}
	make_suc_words(sc, &tmpl);
      }
      /* without prefix or suffix: go straight to what follows the core */
      make_following_word_list(sc, &tmpl);
    }
  }
}

static void
make_dummy_head(struct splitter_context *sc)
{
  struct word_list tmpl;
  setup_word_list(&tmpl, 0, 0);
  tmpl.core_seq = 0;
  tmpl.core_wt = anthy_wtype_noun;
  tmpl.score = SCORE_PER_LEN;
  make_suc_words(sc, &tmpl);
}

/*
 * Enumerate every word_list for the string held in the context.
 *
 * Pass 1 looks up every substring of up to 30 characters, longest first at
 * each start position. It records the longest POS_SUC entry starting at each
 * position (seq_len) and the longest POS_PRE entry ending at each position
 * (rev_seq_len), and collects all independent words.
 * Pass 2 runs make_word_list() on each collected word, then
 * make_dummy_head() for the zero-length head.
 *
 * rev_seq_len[] is indexed with values up to char_count, so it presumably
 * holds char_count + 1 entries (TODO confirm against its allocation).
 */
void 
anthy_make_word_list_all(struct splitter_context *sc)
{
  struct depword_ent{
    struct depword_ent *next;
    int from, len;
    seq_ent_t se;
  } *found = 0, *cur;
  struct word_split_info_cache *info = sc->word_split_info;
  allocator de_ator;
  int start;

  de_ator = anthy_create_allocator(sizeof(struct depword_ent), 0);

  /* pass 1: every start position */
  for (start = 0; start < sc->char_count; start++) {
    int n = sc->char_count - start;

    if (n > 30) {
      n = 30;
    }
    /* every length, longest first */
    for (; n > 0; n--) {
      seq_ent_t se;
      xstr key;

      key.len = n;
      key.str = sc->ce[start].c;
      se = anthy_get_seq_ent_from_xstr(&key);
      if (!se) {
	/* this substring has no seq_ent */
	continue;
      }

      /* keep track of the longest suffix / prefix candidates */
      if (n > info->seq_len[start] &&
	  anthy_get_seq_ent_pos(se, POS_SUC)) {
	info->seq_len[start] = n;
      }
      if (n > info->rev_seq_len[start + n] &&
	  anthy_get_seq_ent_pos(se, POS_PRE)) {
	info->rev_seq_len[start + n] = n;
      }

      /* remember independent words for pass 2 (newest first) */
      if (anthy_get_seq_ent_indep(se)) {
	cur = anthy_smalloc(de_ator);
	cur->from = start;
	cur->len = n;
	cur->se = se;
	cur->next = found;
	found = cur;
      }
    }
  }

  /* pass 2: build word_lists around every independent word found */
  for (cur = found; cur; cur = cur->next) {
    make_word_list(sc, cur->se, cur->from, cur->len);
  }

  /* add the zero-length core at the head of the string */
  make_dummy_head(sc);

  anthy_free_allocator(de_ator);
}

/*
 * Turn one tokenised line of the INDEPWORD file into a wordseq_rule and
 * push it onto the front of gRules.
 *
 * tokens[0] is the word type name and tokens[1] the node name. A line with
 * fewer than two tokens is reported and skipped. A line whose name cannot
 * be duplicated is likewise reported and skipped, so no rule with a NULL
 * name enters the list.
 *
 * NOTE(review): the result of anthy_name_to_wtype() is not checked; what
 * r->wt holds for an unknown name depends on that function. TODO confirm.
 */
static void parse_line(char **tokens, int nr)
{
  struct wordseq_rule *r;
  char *name;

  if (nr < 2) {
    printf("Syntax error in indepword defs"
	   " :%d.\n", anthy_get_line_number());
    return ;
  }
  /* duplicate the name before allocating the rule, so that a failure here
   * leaves nothing half-initialised behind */
  name = strdup(tokens[0]);
  if (!name) {
    printf("Out of memory in indepword defs"
	   " :%d.\n", anthy_get_line_number());
    return ;
  }
  /* the first token is the word type name */
  r = anthy_smalloc(wordseq_rule_ator);
  r->name = name;
  anthy_name_to_wtype(tokens[0], &r->wt);
  /* the second token is the node name */
  r->node_id = anthy_get_node_id_by_name(tokens[1]);
  /* add the rule to the list */
  r->next = gRules;
  gRules = r;
}

/*
 * Load the word connection rules.
 *
 * Creates wordseq_rule_ator, opens the file named by the "INDEPWORD"
 * configuration entry and feeds every line to parse_line().
 * Returns 0 on success, or -1 when the entry is missing or the file cannot
 * be opened. The allocator has already been created in both failure cases.
 */
static int 
init_word_seq_tab(void)
{
  const char *path;
  char **toks;
  int ntoks;

  wordseq_rule_ator = anthy_create_allocator(sizeof(struct wordseq_rule),
					     wordseq_rule_dtor);

  path = anthy_conf_get_str("INDEPWORD");
  if (path == NULL) {
    printf("independent word dict unspecified.\n");
    return -1;
  }

  if (anthy_open_file(path) == -1) {
    printf("Failed to open indep word dict (%s).\n", path);
    return -1;
  }

  /* start from an empty rule list, then read until anthy_read_line()
   * returns non-zero */
  gRules = NULL;
  for (;;) {
    if (anthy_read_line(&toks, &ntoks)) {
      break;
    }
    parse_line(toks, ntoks);
    anthy_free_line();
  }
  anthy_close_file();

  return 0;
}

/*
 * Public entry point for initialising this module: loads the word
 * connection rules and returns the result of init_word_seq_tab()
 * (0 on success, -1 on failure).
 */
int anthy_init_wordlist(void)
{
  int rv;

  rv = init_word_seq_tab();
  return rv;
}
