﻿using System;
using System.Collections.Generic;
using System.Text;
using ChaKi.Entity.Corpora;
using System.IO;

namespace ChaKi.Service.Readers
{
    /// <summary>
    /// Base class for readers that load a CaboCha-style analysed text file into a
    /// <see cref="Corpus"/>, producing one <see cref="Sentence"/> per "EOS" line.
    /// </summary>
    /// <remarks>
    /// Input lines are classified by their prefix:
    /// lines starting with "*" are handed to <c>Sentence.AddBunsetsu</c>,
    /// lines starting with "EOS" close the current sentence, and every other
    /// non-blank line is handed to <see cref="AddLexeme"/>.
    /// </remarks>
    public abstract class CabochaReader
    {
        // Line prefixes that drive the line classification in ReadSentences.
        private const string BunsetsuPrefix = "*";
        private const string SentenceEndMarker = "EOS";

        /// <summary>Corpus that receives every completed sentence.</summary>
        protected Corpus m_Corpus;

        /// <summary>
        /// Lexicon slot for derived readers. This base class never reads or writes it;
        /// presumably AddLexeme implementations use it - confirm against subclasses.
        /// </summary>
        protected Lexicon m_Lexicon;

        /// <summary>
        /// Converts one token line of the input into a <see cref="Lexeme"/>.
        /// </summary>
        /// <param name="s">The raw input line (neither a "*" line nor an "EOS" line, and not blank).</param>
        /// <returns>
        /// The lexeme for the line. A null return makes the reader skip the line;
        /// an exception is reported on the console and the line is skipped as well.
        /// </returns>
        public abstract Lexeme AddLexeme(string s);

        /// <summary>
        /// Reads the file at <paramref name="path"/> and adds every sentence it contains
        /// to <see cref="m_Corpus"/>.
        /// </summary>
        /// <param name="path">Path of the file to read.</param>
        /// <param name="encoding">Encoding name, resolved through <see cref="Encoding.GetEncoding(string)"/>.</param>
        public void ReadFromFile(string path, string encoding)
        {
            using (TextReader streamReader = new StreamReader(path, Encoding.GetEncoding(encoding)))
            {
                ReadSentences(streamReader);
            }

#if false
            // Register each Bunsetsu in the Corpus as a Segment and Link
            Console.WriteLine("\nChecking Segments (Count={0})", bunsetsuList.Count);
            n = 0;
            foreach (CabochaBunsetsu buns in bunsetsuList.Values)
            {
                if (++n % 100 == 0)
                {
                    Console.Write("> {0}\r", n);
                }
                Segment seg = new Segment();
                seg.StartChar = buns.StartPos;
                seg.EndChar = buns.EndPos;
                seg.Text = "Bunsetsu";
                m_Corpus.AddSegment(seg);
                buns.Seg = seg;
            }
            Console.WriteLine("> {0}", bunsetsuList.Count);
            Console.WriteLine("Checking Links (Count={0})", bunsetsuList.Count);
            n = 0;
            foreach (CabochaBunsetsu buns in bunsetsuList.Values)
            {
                if (++n % 100 == 0)
                {
                    Console.Write("> {0}\r", n);
                }
                CabochaBunsetsu depBunsetsu = bunsetsuList.Find(buns.Sen, buns.DependsTo);
                if (depBunsetsu != null)
                {
                    Link link = new Link();
                    link.From = buns.Seg;
                    link.To = depBunsetsu.Seg;
                    link.Text = buns.DependsAs;
                    m_Corpus.AddLink(link);
                }
            }
            Console.WriteLine("> {0}", bunsetsuList.Count);
#endif
        }

        /// <summary>
        /// Consumes <paramref name="reader"/> line by line, building sentences and
        /// adding each one to <see cref="m_Corpus"/> when its "EOS" line is reached.
        /// Progress is written to the console every 1000 sentences and once at the end.
        /// </summary>
        /// <param name="reader">Source of input lines; the caller owns and disposes it.</param>
        private void ReadSentences(TextReader reader)
        {
            // Running character offset across the whole input; advanced by each accepted word.
            int charPos = 0;
            // Number of sentences completed so far.
            int n = 0;
            // The first sentence keeps whatever StartChar the Sentence constructor assigns.
            Sentence sen = new Sentence();
            // Most recently read bunsetsu of the current sentence (null until a "*" line is seen).
            Bunsetsu currentBunsetsu = null;
            // Accumulates the plain text of the current sentence.
            StringBuilder sb = new StringBuilder();

            string s;
            while ((s = reader.ReadLine()) != null)
            {
                // Ordinal comparison: the markers are format tokens, not linguistic text.
                // NOTE(review): both markers are matched by prefix only, so a token line whose
                // text begins with "*" or "EOS" is classified as a marker - confirm acceptable.
                if (s.StartsWith(BunsetsuPrefix, StringComparison.Ordinal))
                {
                    // Start of a bunsetsu
                    try
                    {
                        currentBunsetsu = sen.AddBunsetsu(s);
                    }
                    catch (Exception)
                    {
                        // Best effort: report the bad line and keep the previous bunsetsu.
                        Console.WriteLine("Bunsetsu parse error: {0}", s);
                    }
                }
                else if (s.StartsWith(SentenceEndMarker, StringComparison.Ordinal))
                {
                    // End of a sentence.
                    // Per the original author's note: adds the default bunsetsu and makes
                    // the whole dependency structure consistent.
                    sen.CheckBunsetsus();

                    if (++n % 1000 == 0)
                    {
                        Console.Write("> {0}\r", n);
                    }
                    sen.Text = sb.ToString();
                    sen.EndChar = charPos;
                    m_Corpus.AddSentence(sen);

                    // Prepare a fresh Sentence for the words that follow.
                    sen = new Sentence();
                    sen.StartChar = charPos;
                    currentBunsetsu = null;
                    // ToString() above already copied the text, so the buffer can be reused.
                    sb.Length = 0;
                }
                else if (s.Trim().Length > 0)
                {
                    Lexeme m = null;
                    try
                    {
                        m = this.AddLexeme(s);
                    }
                    catch (Exception)
                    {
                        // Best effort: report the bad line and skip it.
                        Console.WriteLine("Lexeme parse error: {0}", s);
                    }
                    if (m != null)
                    {
                        Word w = sen.AddWord(m);
                        w.StartChar = charPos;
                        w.EndChar = charPos + w.CharLength;
                        // currentBunsetsu is null for ChaSen input (no "*" lines).
                        w.Bunsetsu = currentBunsetsu;
                        // Japanese: surface forms are concatenated without a delimiter
                        // to rebuild the plain text.
                        sb.Append(m.Surface);
                        //@todo: English text needs a delimiter between words to rebuild the plain text.

                        charPos += m.CharLength;
                    }
                }
            }
            // NOTE(review): words read after the last "EOS" line are discarded here, because
            // the pending sentence is never added to the corpus - confirm this is intended.
            Console.Write("> {0} Sentences Found.\r", n);
        }

        // Disabled legacy parser for bunsetsu header lines, kept for reference only.
#if false
        private CabochaBunsetsu ParseBunsetsu(Sentence sen, int charPos, string s)
        {
            char[] bunsetsuSplitPattern = new char[] { ' ' };
            char[] numberPattern = new char[] { '-', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '0' };

            // Parse a line of the form "* 0 -1D 0/0 0.00000000"
            string[] bunsetsuparams = s.Split(bunsetsuSplitPattern);
            if (bunsetsuparams.Length < 3)
            {
                throw new InvalidDataException();
            }
            int bunsetsuPos = Int32.Parse(bunsetsuparams[1]);
            int pos = bunsetsuparams[2].LastIndexOfAny(numberPattern);
            if (pos < 0 || pos + 1 > bunsetsuparams[2].Length - 1)
            {
                throw new InvalidDataException();
            }
            int depBunsetsuId = Int32.Parse(bunsetsuparams[2].Substring(0, pos + 1));
            string depType = bunsetsuparams[2].Substring(pos + 1, bunsetsuparams[2].Length - pos - 1);

            // If the parameters are valid, create the bunsetsu object
            if (bunsetsuPos < 0 || depType == null)
            {
                throw new InvalidDataException();
            }
            return new CabochaBunsetsu(sen, charPos, bunsetsuPos, depType, depBunsetsuId);
        }
#endif
    }
}
