/*
  pronounce     homophone extraction
  pronounce.cc

 Copyright (C) 2006 Masahiko Higashiyama  All rights reserved.
 This is free software with ABSOLUTELY NO WARRANTY.

 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.

 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.

 You should have received a copy of the GNU General Public License
 along with this program; if not, write to the Free Software
 Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
 02111-1307, USA
*/

#include <cmath>
#include <cstdlib>
#include <cstring>
#include <fstream>
#include <iostream>
#include <map>
#include <sstream>
#include <string>
#include <utility>

#include <unistd.h>

// Option summary; main() prints it right after "Usage: <program name>".
#define USAGE \
  " [options] \n" \
  "\n" \
  "  -e, set threshold entropy.\n" \
  "  -f, set threshold frequency. \n" \
  "  -i, set input filename.\n" \
  "  -o, set output filename.\n" \
  "  -t, set task 0 or 1.\n" \
  "  -v, set verbose mode.\n" \
  "  -V, show the version.\n" \
  "  -h, show this help."

// Version string and package name reported by the -V option.
#define VERSION "0.01"
#define PACKAGE "PRONOUNCE"


// Frequency table: name -> number of occurrences.
typedef std::map<std::string, int> count_map;
// One (name, occurrences) entry of a count_map.
typedef count_map::value_type value_pair;


/*
  Pronounce

  Builds a table that maps a pronunciation to the surface forms seen with
  it and their occurrence counts, then either dumps that table or prints
  the pronunciations that pass the entropy / frequency thresholds (run()).

  Input defaults to std::cin and output to std::cout; read() and
  set_writefile() can redirect either one to a file.  Streams opened by
  this class are owned by it and are closed (and flushed) on destruction.
*/
class Pronounce {

private:
  std::istream  *ifs;   // current input; std::cin unless read() opened a file
  std::ostream  *ofs;   // current output; std::cout unless set_writefile() opened a file

  bool verbose;         // when true, results also carry entropy, total and per-form counts
  bool owns_ifs;        // ifs was allocated by read() and must be deleted
  bool owns_ofs;        // ofs was allocated by set_writefile() and must be deleted
  int task_flg;         // 0: dump the table; 1: input is a dumped table; otherwise filter raw input

  int freq;             // threshold frequency used by selection()
  double entropy;       // threshold entropy used by selection()

  // pronunciation -> (surface form -> occurrences); also holds the
  // per-pronunciation total under the reserved key "%%count%%".
  std::map <std::string, std::map <std::string, int> > m;

  inline double calc_entropy(int count, std::map<std::string, int>& m2);
  inline int get_count(std::map<std::string, int>& m2);
  inline void parse_string(const std::string& str);
  void read_file(void);
  void print_intermediate();
  void print_result(const std::string& key, std::map<std::string, int>& m2, double entropy_o, int count, int freq);
  inline void selection(void);

  // The object owns raw stream pointers, so copying would lead to a double
  // delete.  Declared but intentionally not defined.
  Pronounce(const Pronounce&);
  Pronounce& operator=(const Pronounce&);

  // True when filename designates a real file rather than the standard
  // stream (a null pointer, "" and "-" all mean "standard stream").
  static bool names_file(const char* filename){
    return filename != 0
      && std::strcmp(filename, "") != 0
      && std::strcmp(filename, "-") != 0;
  }

  // Closes an input file opened by read() and falls back to std::cin.
  void release_input(){
    if(owns_ifs) delete ifs;
    ifs = &std::cin;
    owns_ifs = false;
  }

  // Closes (and thereby flushes) an output file opened by set_writefile()
  // and falls back to std::cout.
  void release_output(){
    if(owns_ofs) delete ofs;
    ofs = &std::cout;
    owns_ofs = false;
  }

public:

  // Starts with standard input/output, non-verbose, no task selected and
  // both thresholds at zero.
  Pronounce()
    : ifs(&std::cin), ofs(&std::cout),
      verbose(false), owns_ifs(false), owns_ofs(false), task_flg(-1),
      freq(0), entropy(0.0) {}

  // Releases only the streams this object allocated; std::cin and
  // std::cout are never deleted.
  ~Pronounce(){
    release_input();
    release_output();
  }

  // Selects the input: the named file, or std::cin for "" / "-".
  // Prints a message and terminates the process with exit(-1) when the
  // file cannot be opened.
  void read(const char* filename){

    release_input();

    if (names_file(filename)){
      ifs = new std::ifstream(filename);
      owns_ifs = true;
      if(! *ifs){
        std::cerr << "Can't open " << filename << std::endl;
        std::exit(-1);
      }
    }
  }

  // Selects the output: the named file, or std::cout for "" / "-".
  // Prints a message and terminates the process with exit(-1) when the
  // file cannot be opened.
  void set_writefile(const char* filename){

    release_output();

    if (names_file(filename)){
      ofs = new std::ofstream(filename);
      owns_ofs = true;
      if(! *ofs){
        std::cerr << "Can't open " << filename << std::endl;
        std::exit(-1);
      }
    }
  }

  // Reads the whole input and performs the configured task.
  void run(void);

  // Sets the task: 0 dumps the count table, 1 filters a previously dumped
  // table, any other value filters raw input.
  void set_task(int task){
    task_flg = task;
  }

  // Sets the threshold frequency.
  void set_freq(int f){
    freq = f;
  }

  // Sets the threshold entropy.
  void set_entropy(double e){
    entropy = e;
  }

  // Enables or disables verbose result lines.
  void set_verbose(bool b){
    verbose = b;
  }

};

/*
  Returns the Shannon entropy, in bits, of the frequency table m2.

  count : total used as the denominator for every probability.
  m2    : name -> frequency table; it is only read.

  Each frequency f contributes -(f/count) * log2(f/count).  Entries whose
  frequency is not positive are skipped: p * log(p) tends to 0 as p -> 0,
  while evaluating log(0) would turn the whole sum into NaN.  A
  non-positive count yields 0.0 instead of dividing by zero.
*/
inline double
Pronounce::calc_entropy(int count, std::map<std::string, int>& m2){

  if(count <= 0) return 0.0;

  const double ln2 = std::log(2.0);
  const double total = static_cast<double>(count);

  double bits = 0.0;
  for(count_map::iterator itr = m2.begin(); itr != m2.end(); ++itr){
    if(itr->second <= 0) continue;
    const double p = static_cast<double>(itr->second) / total;
    bits -= (p * std::log(p)) / ln2;
  }

  return bits;
}

inline int 
Pronounce::get_count(std::map<std::string, int>& m2){

    count_map::iterator count_itr = m2.find("%%count%%");
    if(count_itr == m2.end()) return 0;

    int count = count_itr->second;
    m2.erase("%%count%%");

    return count;

}

/*
  Parses one input line and merges it into the table m.

  task_flg == 1: the line is "key name:count name:count ..." (the layout
  written by print_intermediate()).  Each field is split at its first ':';
  a field without ':' gets count 0.  The record is stored under key unless
  that key is already present.  Lines without a key (blank lines) are
  ignored.

  Any other task: the line holds four whitespace-separated fields, read in
  the order surface, base, pronounce, pos.  When pos is one of the accepted
  tags, the count of surface under pronounce and the "%%count%%" total of
  pronounce are both incremented.
*/
inline void
Pronounce::parse_string(const std::string& str)
{

  std::istringstream iss(str);

  if(task_flg == 1){

    std::string key;
    if(!(iss >> key)) return;   // blank line: nothing to record

    std::map<std::string, int> cnt_m;
    std::string field;
    while(iss >> field){
      const std::string::size_type colon = field.find(':');
      const std::string name = field.substr(0, colon);
      // Same conversion as before: text after the first ':' goes through
      // atoi(); a missing ':' means an empty number, i.e. 0.
      int value = 0;
      if(colon != std::string::npos)
        value = std::atoi(field.c_str() + colon + 1);
      cnt_m.insert(value_pair(name, value));
    }
    m.insert(std::pair<std::string, std::map<std::string, int> >(key, cnt_m));
    return;
  }

  std::string surface, base, pronounce, pos;
  iss >> surface >> base >> pronounce >> pos;

  // NOTE(review): the literals below look like mis-decoded EUC-JP
  // part-of-speech tags (probably verb, adjective, noun and unknown word).
  // They are kept byte-for-byte; confirm against the original file encoding.
  if(pos == "ư" || pos == "ƻ" || pos == "̾" || pos == "̤θ"){
    std::map<std::string, int>& forms = m[pronounce];   // single lookup
    forms[surface] += 1;
    forms["%%count%%"] += 1;
  }

}


inline void
Pronounce::selection(void){

  int freq1 = freq;
  for(std::map<std::string, std::map<std::string, int> >::iterator itr = m.begin();
      itr != m.end(); ++itr){
    std::map<std::string, int>& m2 = itr->second;
    int count = get_count(m2);
    if(count == 0) continue;

    double entropy_o = calc_entropy(count, m2);
    if(entropy > entropy_o) continue;

    for(std::map<std::string, int>::iterator itr2 = m2.begin(); itr2 != m2.end(); ++itr2)
      if(freq1 > itr2->second) 
	m2.erase(itr2);
  
    if(m2.size() ==0) continue;
    print_result(itr->first, m2, entropy_o, count, freq);
  }
  
}



/*
  Feeds every line of the input stream to parse_string().

  The loop is driven by the result of getline() itself: it ends on end of
  input and also on a read error, and it never parses a phantom empty line
  after the final newline.  A last line without a trailing newline is still
  processed.
*/
void 
Pronounce::read_file(void){

  std::string line;
  while(std::getline(*ifs, line)){
    parse_string(line);
  }

}


/*
  Dumps the whole table to the output stream, one key per line:

    key<TAB>name:count<TAB>name:count...

  The "%%count%%" total is written like any other entry.
*/
void
Pronounce::print_intermediate(){

  typedef std::map<std::string, std::map <std::string, int> > table_type;

  std::ostream& out = *ofs;
  for(table_type::iterator row = m.begin(); row != m.end(); ++row){
    out << row->first;

    std::map<std::string, int>& forms = row->second;
    std::map<std::string, int>::iterator cell = forms.begin();
    while(cell != forms.end()){
      out << '\t' << cell->first << ':' << cell->second;
      ++cell;
    }

    out << std::endl;
  }

}



void
Pronounce::print_result(const std::string& key, count_map& m2, double entropy_o, int count, int freq){

  *ofs << key;
  if(verbose == true){
    *ofs << '\t' << "%%entropy%%" << ':' << entropy_o;
    *ofs << '\t' << "%%count%%" << ':' << count;
  }
  if(verbose == true){
    for(count_map::iterator value_itr = m2.begin(); value_itr != m2.end(); ++value_itr){
      *ofs << '\t' << value_itr->first;
      *ofs << ':' << value_itr->second;
    }
  }else{
    for(count_map::iterator value_itr = m2.begin(); value_itr != m2.end(); ++value_itr){
      *ofs << '\t' << value_itr->first;
    }
  }
  *ofs << std::endl;
  
}

/*
  Reads the whole input, then dumps the table when the task is 0 and runs
  the threshold selection for every other task value.
*/
void Pronounce::run(){
  read_file();

  if(task_flg == 0){
    print_intermediate();
  }else{
    selection();
  }
}


/*
  Command-line entry point.

  Options:
    -e VALUE  threshold entropy (must be > 0, default 1.2)
    -f VALUE  threshold frequency (must be > 0, default 100)
    -i FILE   input file (default: standard input)
    -o FILE   output file (default: standard output)
    -t TASK   task number passed to Pronounce::set_task()
    -v        verbose result lines
    -V        print the version and exit
    -h        print the usage text and exit

  An invalid threshold or an unknown option ends the program with
  EXIT_FAILURE.
*/
int
main(int argc, char* argv[])
{

  Pronounce p;
  // std::string keeps paths of any length intact; a fixed buffer filled
  // with strncpy() could be left without a terminating NUL.
  std::string input_path;
  std::string output_path;

  double entropy = 1.2;
  int freq = 100;
  int task = -1;

  int opt;
  while((opt = getopt(argc, argv, "vVhi:o:e:f:t:")) != -1){
    switch(opt) {
    case 'e':
      if((entropy = std::atof(optarg)) <= 0){
        std::cerr << argv[0] << " entropy is invalid " << entropy << std::endl;
        std::exit(EXIT_FAILURE);
      }
      break;
    case 'f':
      if((freq = std::atoi(optarg)) <= 0){
        std::cerr << argv[0] << " freq is invalid " << freq << std::endl;
        std::exit(EXIT_FAILURE);
      }
      break;
    case 'i':
      input_path = optarg;
      break;   // must not fall into 'o': that opened the input file for writing
    case 'o':
      output_path = optarg;
      break;
    case 'v':
      p.set_verbose(true);
      break;
    case 'V':
      std::cerr << VERSION << " of " << PACKAGE << std::endl;
      std::exit(EXIT_SUCCESS);
      break;
    case 't':
      task = std::atoi(optarg);
      break;
    case 'h':
      std::cerr << "Usage: " << argv[0] << USAGE << std::endl;
      std::exit(EXIT_SUCCESS);
      break;
    default:
      std::cerr << "Usage: " << argv[0] << USAGE << std::endl;
      std::exit(EXIT_FAILURE);
    }
  }

  p.set_freq(freq);
  p.set_entropy(entropy);
  p.set_task(task);

  // Open the input first so that an unreadable input does not truncate the
  // output file.  The output stream is always configured, so running
  // without -o writes to standard output.
  p.read(input_path.c_str());
  p.set_writefile(output_path.c_str());
  p.run();

  return 0;
}


