/*
 * makeda - builder for double-array dictionaries.
 *
 * Copyright (C) 1996, 1997, 2000, 2001, 
 *                            Nara Institute of Science and Technology
 *                           
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *      This product includes software developed by Nara Institute of 
 *      Science and Technology.
 * 4. The name Nara Institute of Science and Technology may not be used to
 *    endorse or promote products derived from this software without specific
 *    prior written permission.
 *    
 *
 * THIS SOFTWARE IS PROVIDED BY Nara Institute of Science and Technology 
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A 
 * PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE Nara Institute
 * of Science and Technology BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
 * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 
 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * $Id: makeda.cpp,v 1.5 2003/02/16 07:27:08 kazuma-t Exp $
 */

#include <cstdlib>
#include <cstring>
#include <fstream>
#include <iomanip>
#include <iostream>
#include <map>
#include <string>
#include <vector>

#include <darts.h>

/*
 * Encoding of small integers in the intermediate dictionary file.
 * Every encoded byte carries a bias of CHAINT_OFFSET; a two-byte value is
 * written as two such digits in radix CHAINT_SCALE, high digit first.
 */
const int CHAINT_OFFSET = 11;
const int CHAINT_SCALE  = 256 - CHAINT_OFFSET;

/*
 * Entry - one dictionary record as read from the intermediate file.
 *
 * The string fields are copied into the data file; the numeric fields are
 * packed into the fixed-size lexicon record.  The linguistic meaning of the
 * individual fields is defined by the dictionary format, not by this tool.
 */
struct Entry {
    // Scalars start at zero so a default-constructed Entry never carries
    // indeterminate values.
    Entry() : posid(0), inf_type(0), inf_form(0), weight(0), con_tbl(0) {}

    std::string form;         // key under which the entry is indexed
    std::string reading;      // stored in the data file
    std::string pron;         // stored in the data file
    std::string base;         // stored in the data file
    std::string info;         // stored in the data file
    unsigned short posid;     // packed into the lexicon record
    unsigned char inf_type;   // packed into the lexicon record
    unsigned char inf_form;   // packed into the lexicon record
    unsigned short weight;    // packed into the lexicon record
    short con_tbl;            // packed into the lexicon record
    std::string compound;     // remainder of the input line, stored verbatim
};

/*
 * IntFile - reader for the intermediate dictionary stream.
 *
 * A record, as consumed by getentry(), consists of
 *   - five NUL-terminated strings: form, reading, pron, base, info;
 *   - posid (2 bytes), inf_type (1 byte), inf_form (1 byte),
 *     weight (2 bytes), con_tbl (2 bytes);
 *   - the rest of the line up to '\n' (compound).
 * Numeric bytes are biased by CHAINT_OFFSET; two-byte values are two such
 * digits in radix CHAINT_SCALE, high digit first.
 */
class IntFile {
public:
    IntFile(std::istream& fin = std::cin) : fin(fin) {}

    /*
     * Read the next record into `entry`.
     *
     * Returns non-zero when a record was read and the stream is still
     * usable, zero at end of input or on any stream failure.  A final
     * record whose line is not terminated by '\n' is reported as end of
     * input.  On a zero return `entry` may be partially overwritten.
     */
    int getentry(Entry& entry) {
        // con_tbl is a signed short but is encoded like the unsigned
        // fields, so decode into a temporary and convert afterwards.
        unsigned short con_tbl = 0;
        if (!(getstring(entry.form) && getstring(entry.reading) &&
              getstring(entry.pron) && getstring(entry.base) &&
              getstring(entry.info) &&
              gettwobyte(entry.posid) &&
              getonebyte(entry.inf_type) && getonebyte(entry.inf_form) &&
              gettwobyte(entry.weight) &&
              gettwobyte(con_tbl)))
            return 0;
        entry.con_tbl = static_cast<short>(con_tbl);

        geteol(entry.compound);
        return fin.good();
    }

private:
    std::istream& fin;

    // All helpers report fin.good() rather than !fin.eof(): a stream that
    // failed without reaching EOF (e.g. a file that could not be opened)
    // must stop the caller's read loop instead of spinning forever.

    // Read up to and including the next NUL byte.
    int getstring(std::string& str) {
        std::getline(fin, str, '\0');
        return fin.good();
    }
    // Read up to and including the next newline.
    int geteol(std::string& str) {
        std::getline(fin, str);
        return fin.good();
    }
    // Decode one biased byte.
    int getonebyte(unsigned char& num) {
        char c = 0;
        fin.get(c);
        num = static_cast<unsigned char>(
            static_cast<unsigned char>(c) - CHAINT_OFFSET);
        return fin.good();
    }
    // Decode a two-digit value, high digit first.
    int gettwobyte(unsigned short& num) {
        char c0 = 0;
        char c1 = 0;
        fin.get(c0);
        fin.get(c1);
        const int high = static_cast<unsigned char>(c0) - CHAINT_OFFSET;
        const int low = static_cast<unsigned char>(c1) - CHAINT_OFFSET;
        num = static_cast<unsigned short>(high * CHAINT_SCALE + low);
        return fin.good();
    }
};

/*
 * DataFile - writer for the per-entry data records.
 *
 * Record layout produced by write():
 *   3 bytes   sizes of reading, pron and base (each cut to 8 bits)
 *   2 bytes   size of info as an unsigned short in host byte order
 *   reading, pron, base and info, each followed by a NUL byte
 *   compound followed by '\n'
 */
class DataFile {
public:
    DataFile(std::ostream& fout = std::cout) : fout(fout) {}

    /*
     * Append the record for `entry`.
     * Returns the stream position at which the record starts, or -1 when
     * the stream is in a failed state after writing.
     */
    int write(const Entry& entry) {
        const long offset = fout.tellp();

        // Fixed-size header: three one-byte sizes, then the info size.
        fout << static_cast<unsigned char>(entry.reading.size())
             << static_cast<unsigned char>(entry.pron.size())
             << static_cast<unsigned char>(entry.base.size());
        const unsigned short info_size =
            static_cast<unsigned short>(entry.info.size());
        fout.write(reinterpret_cast<const char*>(&info_size),
                   sizeof(info_size));

        // Variable-size payload.
        fout << entry.reading << '\0'
             << entry.pron << '\0'
             << entry.base << '\0'
             << entry.info << '\0'
             << entry.compound << '\n';

        if (!fout.fail())
            return offset;
        return -1;
    }

private:
    std::ostream& fout;
};

/*
 * bytecpy(dist, src): copy the object representation of `src` to the char
 * pointer `dist`, then advance `dist` past the bytes just written.
 * `dist` must be a modifiable char* lvalue and `src` an lvalue.
 * Wrapped in do { } while (0) so that it behaves as a single statement,
 * e.g. in an unbraced if/else.
 */
#define bytecpy(dist, src) \
    do { \
        std::memcpy((dist), &(src), sizeof(src)); \
        (dist) += sizeof(src); \
    } while (0)

/*
 * LexFile - writer for the lexicon records that the double array points to.
 *
 * For every key, write() emits
 *   1 byte    key length (cut to 8 bits)
 *   1 byte    number of records (cut to 8 bits)
 *   N records of RECORD_SIZE bytes each, as produced by pack().
 */
class LexFile {
public:
    // Size in bytes of one packed record:
    // posid(2) + inf_type(1) + inf_form(1) + weight(2) + con_tbl(2)
    // + data-file offset(4).
    enum { RECORD_SIZE = 12 };

    LexFile(std::ostream& fout = std::cout) : fout(fout) {}

    /*
     * Write the records of one key.
     * `len` is the key length, `lex_data` the packed records, each pointing
     * to at least RECORD_SIZE readable bytes.
     * Returns the stream position of the first byte written, or -1 when the
     * stream is in a failed state afterwards.
     */
    long write(size_t len, std::vector<char*>& lex_data) {
        const long pos = fout.tellp();
        fout << (unsigned char)len;
        fout << (unsigned char)lex_data.size();
        for (std::vector<char*>::iterator i = lex_data.begin();
             i != lex_data.end(); ++i) {
            fout.write(*i, RECORD_SIZE);
        }
        if (fout.fail())
            return -1;
        return pos;
    }

    /*
     * Serialise the numeric fields of `entry` plus `dat_index` into `str`,
     * which must provide at least RECORD_SIZE bytes.  Returns `str`.
     *
     * The offset is stored in exactly four bytes.  Copying the `long`
     * itself would write sizeof(long) bytes, i.e. run past a RECORD_SIZE
     * buffer wherever long is 8 bytes wide, while write() emits only
     * RECORD_SIZE bytes per record anyway.  Offsets that do not fit in
     * 32 bits cannot be represented in this record format.
     */
    static char* pack(char* str,
                      const Entry& entry, const long dat_index) {
        char *head = str;
        const int index32 = static_cast<int>(dat_index);
        bytecpy(str, entry.posid);
        bytecpy(str, entry.inf_type);
        bytecpy(str, entry.inf_form);
        bytecpy(str, entry.weight);
        bytecpy(str, entry.con_tbl);
        bytecpy(str, index32);

        return head;
    }

private:
    // Compile-time check (C++98 style): the record layout relies on a
    // four-byte int for the stored offset.
    typedef char index_must_be_four_bytes[sizeof(int) == 4 ? 1 : -1];

    std::ostream& fout;
};

int main(int argc, char *argv[])
{
    if (argc < 3) {
	std::cerr << "Usage: makeda CHADIC_INT DICNAME\n";
	exit(EXIT_FAILURE);
    }

    std::string dataname, lexname, daname;
    dataname = lexname = daname = argv[2];
    dataname += ".dat";
    lexname += ".lex";
    daname += ".da";

    std::ifstream intstream(argv[1], std::ios::in|std::ios::binary);

    if (!intstream.is_open()) {
	std::cerr << argv[1] << ": cannot open\n";
    }
    std::ofstream datastream(dataname.c_str(),
			     std::ios::out|std::ios::binary);
    std::ofstream lexstream(lexname.c_str(),
			    std::ios::out|std::ios::binary);
    if (!datastream.is_open() || !lexstream.is_open()) {
	std::cerr << argv[2] << ": cannot open\n";
    }

    Entry entry;
    IntFile intfile(intstream);
    DataFile datfile(datastream);
    LexFile lexfile(lexstream);

    typedef std::multimap<std::string, char*> Hash;
    typedef Hash::value_type HashVal;

    Hash entries;
    while (intfile.getentry(entry)) {
	long da_index = datfile.write(entry);
	char* buf = new char[12];
	entries.insert(HashVal(entry.form,
			       LexFile::pack(buf, entry, da_index)));
    }
    std::cerr << entries.size() << " entries" << std::endl;

    Hash::iterator i, last;
    int size = 0;
    char** keys = new char*[entries.size()];
    size_t* lens = new size_t[entries.size()];
    int* vals = new int[entries.size()];

    std::vector<char*> lex_data;
    i = entries.begin();
    while (i != entries.end()) {
	const std::string& key = i->first;
	last = entries.upper_bound(key);
	lex_data.clear();
	for (; i != last; i++) {
	    lex_data.push_back(i->second);
	}
	lens[size] = key.size();
	(const char*)keys[size] = key.data();
	vals[size] = lexfile.write(key.size(), lex_data);
	if (vals[size] < 0) {
	    std::cerr << "Unexpected error at " << key << std::endl;
	    exit(EXIT_FAILURE);
	}
	size++;
    }
    std::cerr << size << " keys" << std::endl;

    Darts::DoubleArray da;
    da.build(size, keys, lens, vals);

    return da.save(daname.c_str(), "wb");
}
