#!/usr/bin/ruby
# -*- coding:utf-8 -*-


# ==============================================================================
# split_new_words
# ==============================================================================

split_new_words = Proc.new do
	# Builds the additional dictionary file $dicname from the entry list $filename.
	#
	# Inputs (globals set by the caller):
	#   $filename - tab-separated entries: reading, notation, cost.
	#               A cost prefixed with "*" marks an entry that came from mozc.
	#   $dicname  - path of the dictionary file to (over)write.
	# Side effects:
	#   Sets $id to the ID found in ../src/id.def for the general-noun class
	#   (left untouched when no such line exists) and writes $dicname with
	#   lines of the form: reading, id, id, cost, notation (tab-separated).

	# Decide the ID used for every added entry. The file is read as UTF-8
	# explicitly so the comparison with the literal below does not depend on
	# the locale's default external encoding.
	noun_entry = File.readlines("../src/id.def", encoding: "UTF-8")
		.map { |line| line.chomp.split(" ") }
		.find { |fields| fields[1] == "名詞,一般,*,*,*,*,*" }
	$id = noun_entry[0] if noun_entry

	# Sorting puts entries with the same reading+notation next to each other,
	# and puts a mozc entry ("*" cost) ahead of a numeric-cost duplicate.
	entries = File.readlines($filename).sort

	# The block form closes the output file even if an exception is raised.
	File.open($dicname, "w") do |dicfile|
		moz = ["", "", "", "", ""]
		# Reading+notation of the previous line. It starts as nil so the first
		# line is never compared against the last one (index -1 wraparound).
		prev_key = nil

		entries.each do |line|
			fields = line.chomp.split("\t")

			# Blank or truncated lines carry no cost field; skip them
			# instead of failing on a nil field.
			next if fields.length < 3

			key = fields[0..1]
			duplicate = (key == prev_key)
			prev_key = key

			# A mozc entry is only remembered for the comparisons below;
			# it is not written to the output.
			if fields[2][0] == "*"
				moz = fields
				moz[2] = moz[2][1..-1]
				next
			end

			# Drop entries duplicated inside the UT dictionary and entries
			# whose reading+notation already exist in mozc.
			next if duplicate || key == moz[0..1]

			# Same reading as the mozc entry but a different notation:
			# make sure the cost is not lower than mozc's by raising it to
			# 1.1 times the mozc cost (truncated to an integer).
			if fields[0] == moz[0] && fields[2].to_i < moz[2].to_i
				fields[2] = (moz[2].to_i * 1.1).to_i.to_s
			end

			# Input order is reading, notation, cost; output order is
			# reading, id, id, cost, notation.
			dicfile.puts [fields[0], $id, $id, fields[2], fields[1]].join("\t")
		end
	end
end


# ==============================================================================
# main
# ==============================================================================

# Every command-line argument is the path of an entry list to convert.
targetfiles = ARGV

# Nothing to process: print the usage line and stop.
if targetfiles.empty?
	puts "Usage: ruby script.rb [FILE]"
	exit
end

# split_new_words takes its input and output paths from the globals
# $filename and $dicname, so set both before each call. The output path is
# the input path with ".new" appended.
targetfiles.each do |path|
	$filename = path
	$dicname = $filename + ".new"

	split_new_words.call
end
