package zephyr.kenkyusya.lajp;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.HashSet;
import java.util.TreeSet;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import zephyr.util.ZephyrUtil;

/**
 * Command-line checker that lists the Latin-alphabet words of a dictionary body file which are
 * not known as head words or morphological forms.
 *
 * <p>Known words come from three sources: a built-in list of abbreviations, a morphology file
 * (see {@link #loadMorph(String)}) and an HTML file whose {@code <dt id="...">} attributes are
 * taken as head words (see {@link #loadGeorgesLaDe(String)}). The body file is then scanned by
 * {@link #loadBody(String)} and every unknown word is printed to standard output.
 *
 * <p>Instances are not designed for concurrent use; the word sets are plain {@link HashSet}s.
 */
public class CheckLatinWords {

    /**
     * Abbreviations and short forms that are always treated as known when scanning the body
     * file. The last two entries ("Cic", "Ov") were kept apart from the rest in the original
     * list; presumably they are author abbreviations rather than grammatical ones.
     */
    private static final String[] BUILTIN_MORPHS = {
        "I", "II", "III", "Gk", "adj", "adv", "sc", "dim", "alqm", "alqo", "alqd", "neut",
        "pref", "pp", "intr", "dep", "pf", "inf", "acc", "cf", "con", "abl", "freq", "prp",
        "inch", "pass", "indecl", "sg", "refl", "prep", "semi", "conj", "impers", "subj",
        "comp", "int", "pron", "fut", "superl", "bi", "tri", "card", "interrog", "relat",
        "ord", "distrib", "nom", "impr", "gerundiv", "intens", "indef", "us", "pers", "idem",
        "quisque", "lacio", "amb", "cu", "ejus", "pleo", "impf", "alqos", "trav", "paul",
        "deos", "um", "sup", "loc", "voc", "uterque", "tral", "quisquam",
        //
        "Cic", "Ov",
    };

    /** Head words collected from the morphology file and from the {@code <dt id>} file. */
    private final HashSet<String> heads = new HashSet<String>();

    /** Inflected forms from the morphology file plus the built-in abbreviations. */
    private final HashSet<String> morphs = new HashSet<String>();

    /** Creates a checker whose set of known forms is pre-filled with {@link #BUILTIN_MORPHS}. */
    CheckLatinWords() {
        for (String abbreviation : BUILTIN_MORPHS) {
            morphs.add(abbreviation);
        }
    }

    /**
     * Reads the morphology file and registers its forms and head words.
     *
     * <p>Each usable line has the shape {@code form,head rest...}: the text before the first
     * comma is added to the known forms, and the text between that comma and the next space is
     * added to the head words. Lines that do not have this shape are skipped (with a warning on
     * standard error unless they are blank) instead of aborting the whole run. A summary of the
     * set sizes is written to standard error afterwards.
     *
     * @param morphFile path of the morphology file, read as ASCII
     * @throws Exception if the file cannot be opened or read
     */
    private void loadMorph(String morphFile) throws Exception {
        InputStream in = new FileInputStream(new File(morphFile));
        try {
            BufferedReader br = new BufferedReader(new InputStreamReader(in, "ASCII"));
            String line;
            int lineNo = 0;
            while ((line = br.readLine()) != null) {
                lineNo++;
                int comma = line.indexOf(',');
                int space = (comma < 0) ? -1 : line.indexOf(' ', comma + 1);
                if (space < 0) {
                    // Without both separators substring() would throw; skip the line instead.
                    if (!line.trim().isEmpty()) {
                        System.err.println("skipping malformed morph line " + lineNo + ": "
                                + line);
                    }
                    continue;
                }
                heads.add(line.substring(comma + 1, space));
                morphs.add(line.substring(0, comma));
            }
        } finally {
            // Closing the underlying stream releases the file handle even when reading fails.
            in.close();
        }

        System.err.println("head=" + heads.size() + " entries, morphs=" + morphs.size()
                + " entries");
    }

    /**
     * Matches {@code <dt id="} and captures the following run of characters up to the first
     * space, double quote or opening square bracket.
     */
    private static final String DT_ID_REGEXP = "<dt id=\"([^ \"\\[]+)";
    private static final Pattern DT_ID = Pattern.compile(DT_ID_REGEXP);

    /**
     * Reads an HTML file line by line and adds the first {@code <dt id="...">} value found on
     * each line to the head words. The number of newly added head words is reported on standard
     * error.
     *
     * @param georgFile path of the HTML file, decoded with {@code ZephyrUtil.SHIFT_JIS_CODE}
     * @throws Exception if the file cannot be opened or read
     */
    private void loadGeorgesLaDe(String georgFile) throws Exception {
        int before = heads.size();
        InputStream in = new FileInputStream(new File(georgFile));
        try {
            BufferedReader br =
                    new BufferedReader(new InputStreamReader(in, ZephyrUtil.SHIFT_JIS_CODE));
            String line;
            while ((line = br.readLine()) != null) {
                Matcher m = DT_ID.matcher(line);
                if (m.find()) {
                    heads.add(m.group(1));
                }
            }
        } finally {
            in.close();
        }

        System.err.println("head=" + heads.size() + " entries, added " + (heads.size() - before)
                + " entries");
    }

    /** Matches a maximal run of ASCII letters. */
    private static final String WORD_REGEXP = "([a-zA-Z]+)";
    private static final Pattern WORD = Pattern.compile(WORD_REGEXP);

    /**
     * Scans the body file and prints, in sorted order, every word that is neither a known head
     * word nor a known form.
     *
     * <p>Blank lines are ignored. Each remaining line is passed through
     * {@code ZephyrUtil.dropMarks} (presumably stripping diacritical marks; confirm against
     * that helper) before words are extracted. A line whose first character is not a space is
     * treated as an entry line: its first word is compared case-insensitively with the first
     * word of the previous entry line, and a {@code ###} line is printed when it sorts before
     * it. Single-letter words and words directly preceded by a hyphen are never reported.
     *
     * @param bodyFile path of the body file, read as UTF-8
     * @throws Exception if the file cannot be opened or read
     */
    private void loadBody(String bodyFile) throws Exception {
        TreeSet<String> unknown = new TreeSet<String>();
        InputStream in = new FileInputStream(new File(bodyFile));
        try {
            BufferedReader br = new BufferedReader(new InputStreamReader(in, "UTF-8"));
            String line;
            String prevhead = "";
            while ((line = br.readLine()) != null) {
                if (line.trim().isEmpty()) {
                    continue;
                }
                // Safe: the line is non-empty here. The test uses the raw line, not dropline.
                boolean entryLine = line.charAt(0) != ' ';
                String dropline = ZephyrUtil.dropMarks(line);
                Matcher m = WORD.matcher(dropline);
                boolean bFirst = true;
                while (m.find()) {
                    String word = m.group(1);
                    if (entryLine && bFirst) {
                        // Ordering check happens before the filters below, so even a
                        // one-letter first word becomes the new reference head.
                        if (word.compareToIgnoreCase(prevhead) < 0) {
                            System.out.println("###\tprev=" + prevhead + ", cur=" + word);
                        }
                        prevhead = word;
                        bFirst = false;
                    }
                    if (word.length() <= 1) {
                        continue;
                    }
                    if (m.start(1) > 0 && dropline.charAt(m.start(1) - 1) == '-') {
                        // "-um", "-a" etc
                        continue;
                    }
                    if (!heads.contains(word) && !morphs.contains(word)) {
                        unknown.add(word);
                    }
                }
            }
        } finally {
            in.close();
        }

        for (String w : unknown) {
            System.out.println(w);
        }
    }

    /**
     * Entry point. Expects three arguments: the body file, the morphology file and the HTML
     * head-word file, in that order. The two word sources are loaded before the body file is
     * scanned. With fewer arguments a usage message is written to standard error; any failure
     * while processing is reported as a stack trace.
     *
     * @param args command-line arguments as described above
     */
    public static void main(String[] args) {
        if (args.length >= 3) {
            CheckLatinWords app = new CheckLatinWords();
            try {
                app.loadMorph(args[1]);
                app.loadGeorgesLaDe(args[2]);
                app.loadBody(args[0]);
            } catch (Exception e) {
                e.printStackTrace();
            }
        } else {
            System.err
                    .println("Usage: java CheckLatinWords kenkyusya-lajp.txt whitaker.morph.txt georges-ldhd-body.html");
        }
    }

}
