﻿using System;
using System.Diagnostics;
using System.IO;
using System.Text;
using System.Xml.Serialization;
using ChaKi.Entity.Corpora;
using ChaKi.Entity.Readers;
using System.Collections.Generic;

namespace ChaKi.Service.Readers
{
    /// <summary>
    /// Singleton factory that loads reader definitions from an XML file and
    /// creates the <see cref="CorpusSourceReader"/> matching a requested (or guessed) reader type.
    /// </summary>
    public class CorpusSourceReaderFactory
    {
        /// <summary>
        /// Reader definitions deserialized by <see cref="LoadReaderDef(string)"/>.
        /// Null until a definition file has been loaded successfully.
        /// </summary>
        public ReaderDefs ReaderDefs { get; private set; }

        private CorpusSourceReaderFactory() { }

        private static CorpusSourceReaderFactory m_Instance;

        /// <summary>
        /// Creates a new factory from the given definition file and makes it the shared instance.
        /// </summary>
        /// <param name="readerdefpath">Path of the reader definition XML file.</param>
        /// <returns>The newly created shared instance.</returns>
        public static CorpusSourceReaderFactory CreateInstance(string readerdefpath)
        {
            // Load into a local first: if loading throws, the shared instance must not be
            // replaced by a factory whose ReaderDefs is still null.
            CorpusSourceReaderFactory factory = new CorpusSourceReaderFactory();
            factory.LoadReaderDef(readerdefpath);
            m_Instance = factory;
            return m_Instance;
        }

        /// <summary>
        /// The shared instance. On first access it is created by loading "ReaderDefs.xml"
        /// from the directory of the current process's main module.
        /// </summary>
        public static CorpusSourceReaderFactory Instance
        {
            get
            {
                if (m_Instance == null)
                {
                    // Publish only after a successful load so that a failed load is retried
                    // on the next access instead of handing out a half-initialized factory.
                    CorpusSourceReaderFactory factory = new CorpusSourceReaderFactory();
                    factory.LoadReaderDef();
                    m_Instance = factory;
                }
                return m_Instance;
            }
        }

        /// <summary>
        /// Creates the CorpusSourceReader suited to the given reader type.
        /// When <paramref name="readerType"/> is "Auto", the beginning of the file is read
        /// to guess the format first.
        /// </summary>
        /// <param name="path">Path of the corpus source file (read only when the type is "Auto").</param>
        /// <param name="readerType">Reader type name as listed in the reader definitions, or "Auto".</param>
        /// <param name="encoding">Encoding name used to read the file when guessing the format.</param>
        /// <param name="cps">Corpus passed to the created reader.</param>
        /// <param name="lb">Lexicon builder passed to the ChaSen/Mecab readers.</param>
        /// <returns>A reader configured with the field definitions of the resolved reader type.</returns>
        /// <exception cref="Exception">
        /// The reader type is not present in the definitions, or its line format is not supported.
        /// </exception>
        public CorpusSourceReader Create(string path, string readerType, string encoding, Corpus cps, LexiconBuilder lb)
        {
            if (readerType == "Auto")
            {
                readerType = Guess(path, encoding, cps);
            }
            ReaderDef def = this.ReaderDefs.Find(readerType);
            if (def == null)
            {
                throw new Exception(string.Format("Reader type not found in the definition:{0}", readerType));
            }

            CorpusSourceReader rdr;
            switch (def.LineFormat)
            {
                case "TabSeparatedLine":
                    rdr = new CabochaChasenReader(cps, lb);
                    break;
                case "MecabLine":
                    rdr = new CabochaMecabReader(cps, lb);
                    break;
                case "TextLine":
                    rdr = new PlainTextReader(cps);
                    break;
                default:
                    throw new Exception(string.Format("Invalid Reader Type: {0}", readerType));
            }
            // Apply the input field layout of the resolved reader definition to the reader.
            rdr.SetFieldDefs(def.Fields);
            return rdr;
        }

        /// <summary>
        /// Guesses the reader type from the leading lines of the file.
        /// </summary>
        /// <param name="path">Path of the file to inspect.</param>
        /// <param name="encoding">Encoding name used to open the file.</param>
        /// <param name="cps">Not used by the guess itself.</param>
        /// <returns>
        /// One of "ChaSen|Cabocha", "Mecab|Cabocha|UniDic", "Mecab|Cabocha" or "PlainText".
        /// </returns>
        private string Guess(string path, string encoding, Corpus cps)
        {
            // Feature flags gathered from roughly the first 100 lines.
            int maxTabsInLine = 0;
            int maxCommasInLine = 0;
            bool hasEOSLine = false;

            using (TextReader streamReader = new StreamReader(path, Encoding.GetEncoding(encoding)))
            {
                int n = 0;
                string s;
                while ((s = streamReader.ReadLine()) != null)
                {
                    if (s.StartsWith("**", StringComparison.Ordinal)) // Ignore Extdata lines of cabocha
                    {
                        // These lines are skipped entirely and do not count towards the line limit.
                        continue;
                    }
                    if (s.StartsWith("*", StringComparison.Ordinal))
                    {
                        // Line starting with "*": excluded from tab/comma counting.
                    }
                    else if (s.StartsWith("EOS", StringComparison.Ordinal))
                    {
                        hasEOSLine = true;
                    }
                    else
                    {
                        int commas = 0;
                        int tabs = 0;
                        foreach (char c in s)
                        {
                            if (c == ',')
                            {
                                commas++;
                            }
                            else if (c == '\t')
                            {
                                tabs++;
                            }
                        }
                        maxTabsInLine = Math.Max(maxTabsInLine, tabs);
                        maxCommasInLine = Math.Max(maxCommasInLine, commas);
                    }
                    // Post-increment compare: the loop stops after the 102nd counted line.
                    if (n++ > 100)
                    {
                        break;
                    }
                }
            }

            // Decision
            if (hasEOSLine && maxTabsInLine > 3)
            {
                return "ChaSen|Cabocha";
            }
            if (hasEOSLine && maxTabsInLine > 0 && maxCommasInLine > 20)
            {
                return "Mecab|Cabocha|UniDic";
            }
            if (hasEOSLine && maxTabsInLine > 0 && maxCommasInLine > 2)
            {
                return "Mecab|Cabocha";
            }
            return "PlainText";
        }

        /// <summary>
        /// Loads "ReaderDefs.xml" located in the directory of the current process's main module.
        /// </summary>
        public void LoadReaderDef()
        {
            string path;
            using (Process current = Process.GetCurrentProcess())
            {
                // Path.Combine instead of a hard-coded "\\" separator.
                path = Path.Combine(Path.GetDirectoryName(current.MainModule.FileName), "ReaderDefs.xml");
            }
            LoadReaderDef(path);
        }

        /// <summary>
        /// Deserializes the reader definitions from the given XML file and assigns
        /// a PartNo to every tag mapping.
        /// </summary>
        /// <param name="path">Path of the reader definition XML file.</param>
        public void LoadReaderDef(string path)
        {
            XmlSerializer ser = new XmlSerializer(typeof(ReaderDefs));
            using (FileStream fs = new FileStream(path, FileMode.Open, FileAccess.Read))
            {
                this.ReaderDefs = (ReaderDefs)ser.Deserialize(fs);
            }
            // Count how often each mapped Tag occurs within a reader definition and number
            // the mappings of Tags that are specified more than once.
            foreach (ReaderDef def in this.ReaderDefs.ReaderDef)
            {
                if (def.Fields == null) continue;
                Dictionary<string, int> count = new Dictionary<string, int>();
                foreach (Field f in def.Fields)
                {
                    if (f.MappedTo == null) continue;
                    foreach (MappedTo mapping in f.MappedTo)
                    {
                        int seen;
                        if (count.TryGetValue(mapping.Tag, out seen))
                        {
                            seen++;
                        }
                        else
                        {
                            seen = 0;  // Default (property that is not split into parts)
                        }
                        count[mapping.Tag] = seen;
                        mapping.PartNo = seen;
                    }
                }
                foreach (Field f in def.Fields)
                {
                    if (f.MappedTo == null) continue;
                    // Tags specified more than once get every PartNo shifted by +1,
                    // so their parts are numbered 1..n while single-use Tags keep 0.
                    foreach (MappedTo mapping in f.MappedTo)
                    {
                        if (count[mapping.Tag] > 0)
                        {
                            mapping.PartNo++;
                        }
                    }
                }
            }
        }
    }
}
