/*
 * Decompiled with CFR 0.152.
 */
package jp.ac.dendai.cdl.mori.wikie.util;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.net.URL;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;
import java.util.TreeSet;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import jp.ac.dendai.cdl.mori.wikie.main.WikIE;
import jp.ac.dendai.cdl.mori.wikie.util.WEntry;
import org.apache.commons.codec.DecoderException;
import org.apache.commons.codec.net.URLCodec;
import org.apache.commons.lang.StringEscapeUtils;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class WNormalizer {
    public static final String LANG_URL = "http://en.wikipedia.org/wiki/Special:Export/List_of_Wikipedias";
    private String baseURL;
    private Map<String, Integer> nsNumberMap;
    private Set<String> langSet;
    private Set<String> projectSet;

    /**
     * Creates a normalizer backed by the namespace table of a dump file and the
     * supplied prefix sets.
     *
     * <p>The namespace table is built by {@link #createNSNumberMap(String)}. The two
     * sets are stored by reference, not copied. Lookups elsewhere in this class
     * lower-case and trim each candidate before testing membership, so the sets are
     * expected to hold lower-case entries.
     *
     * @param sourceFilePathStr path of the file scanned for namespace definitions
     * @param projectSet known project prefixes
     * @param langSet known language prefixes
     * @throws IOException if the file cannot be opened or read
     * @throws UnsupportedEncodingException if the reader's character encoding is unavailable
     */
    public WNormalizer(String sourceFilePathStr, Set<String> projectSet, Set<String> langSet) throws IOException, UnsupportedEncodingException {
        // Build the namespace table first, exactly as before, then keep the prefix sets.
        Map<String, Integer> namespaces = this.createNSNumberMap(sourceFilePathStr);
        this.nsNumberMap = namespaces;
        this.langSet = langSet;
        this.projectSet = projectSet;
    }

    /**
     * Builds the namespace-name to namespace-number table from a dump file.
     *
     * <p>The file is opened through the Hadoop {@code FileSystem} and read line by
     * line until a line equal to {@code </namespaces>} (after trimming) is reached.
     * Every {@code <namespace key="N">name</namespace>} line adds
     * {@code lower-cased name -> N}. A {@code <base>} line with a
     * {@code *.wikipedia.org/wiki/} URL sets the value returned by
     * {@link #getBaseURL()} as a side effect. The empty name and {@code "image"} are
     * always pre-registered.
     *
     * <p>If the file ends before {@code </namespaces>} is seen, the entries collected
     * so far are returned instead of failing with a {@code NullPointerException}.
     *
     * @param xmlFilePath path of the file to scan
     * @return mutable map from lower-cased namespace name to namespace number
     * @throws IOException if the file cannot be opened or read
     * @throws UnsupportedEncodingException if UTF-8 is not available
     */
    public Map<String, Integer> createNSNumberMap(String xmlFilePath) throws IOException, UnsupportedEncodingException {
        // Compile both expressions once instead of once per input line.
        Pattern namespacePattern = Pattern.compile("<namespace +key=\"(-*[0-9]+)\">(.*?)</namespace>");
        Pattern basePattern = Pattern.compile("<base>(http://.+?\\.wikipedia\\.org/wiki/).+?</base>", Pattern.CASE_INSENSITIVE);

        Map<String, Integer> result = new HashMap<String, Integer>();
        result.put("", WikIE.ARTICLE_NS_NUM);
        result.put("image", WikIE.IMAGE_NS_NUM);

        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);
        FSDataInputStream fsdis = fs.open(new Path(xmlFilePath));
        try {
            BufferedReader reader = new BufferedReader(new InputStreamReader((InputStream) fsdis, "UTF8"));
            String line;
            // readLine() returns null at end of input; stop there rather than dereferencing it.
            while ((line = reader.readLine()) != null) {
                line = line.trim();
                if (line.equals("</namespaces>")) {
                    break;
                }
                Matcher nsMatcher = namespacePattern.matcher(line);
                if (nsMatcher.find()) {
                    int number = Integer.parseInt(nsMatcher.group(1));
                    String namespace = nsMatcher.group(2).toLowerCase();
                    result.put(namespace, number);
                    continue;
                }
                Matcher baseMatcher = basePattern.matcher(line);
                if (baseMatcher.find()) {
                    this.baseURL = baseMatcher.group(1);
                }
            }
        } finally {
            // Release the underlying stream even when reading or parsing fails.
            fsdis.close();
        }
        return result;
    }

    /**
     * Returns the base URL captured while the namespace table was being read.
     *
     * @return the stored base URL, or {@code null} if none has been assigned
     */
    public String getBaseURL() {
        String url = this.baseURL;
        return url;
    }

    /**
     * Downloads a wiki-markup page and collects the language codes it lists.
     *
     * <p>Every line containing a table cell of the form {@code | [[:code:|label]]}
     * contributes {@code code}. {@code "en"} is always included. The stream is
     * closed even when reading fails.
     *
     * @param langURL URL of the page to read (for example {@link #LANG_URL})
     * @return sorted, mutable set of language codes
     * @throws UnsupportedEncodingException if UTF-8 is not available
     * @throws IOException if the URL cannot be opened or read
     */
    public Set<String> createLangSet(String langURL) throws UnsupportedEncodingException, IOException {
        // Compile once instead of once per input line.
        Pattern langCell = Pattern.compile("\\| *\\[\\[:(.*?):\\|(.*?)\\]\\].*?");

        Set<String> result = new TreeSet<String>();
        result.add("en");

        InputStream in = new URL(langURL).openStream();
        try {
            BufferedReader reader = new BufferedReader(new InputStreamReader(in, "UTF8"));
            String line;
            while ((line = reader.readLine()) != null) {
                Matcher m = langCell.matcher(line);
                if (m.find()) {
                    result.add(m.group(1));
                }
            }
        } finally {
            // Release the connection even if reading throws.
            in.close();
        }
        return result;
    }

    /**
     * Applies basic wiki title clean-up to a raw link target.
     *
     * <p>Steps: trim, turn underscores into spaces, collapse runs of two or more
     * spaces into one, then drop every leading colon together with any whitespace
     * around those colons.
     *
     * <p>Whitespace between leading colons no longer stops the stripping, so
     * {@code ": :Foo"} becomes {@code "Foo"} rather than {@code ":Foo"}.
     *
     * @param str raw title; must not be {@code null}
     * @return the cleaned title, possibly empty
     */
    public static String wTitleRule(String str) {
        // Plain character replacement; no regular expression is needed for this step.
        String title = str.trim().replace('_', ' ');
        title = title.replaceAll(" {2,}", " ");

        // Skip leading colons and the whitespace mixed in with them. "Whitespace" here
        // follows String.trim(): any character up to and including the space character.
        int start = 0;
        while (start < title.length()) {
            char c = title.charAt(start);
            if (c != ':' && c > ' ') {
                break;
            }
            start++;
        }
        return title.substring(start).trim();
    }

    /**
     * Removes line breaks and invisible control/formatting characters from a string.
     *
     * <p>The removed characters are line feed, carriage return, NUL, U+200E, U+200F,
     * U+2028 and U+2029. Everything else is kept in its original order. The text is
     * filtered in a single pass instead of running one regex replacement per
     * character.
     *
     * @param text input text; must not be {@code null}
     * @return the text without the characters listed above
     */
    public static String deleteNonPrintingChar(String text) {
        StringBuilder kept = new StringBuilder(text.length());
        for (int i = 0; i < text.length(); i++) {
            char c = text.charAt(i);
            switch (c) {
                case '\n':
                case '\r':
                case '\0': // previously removed only implicitly, through an unassigned array slot
                case '\u200E':
                case '\u200F':
                case '\u2028':
                case '\u2029':
                    break;
                default:
                    kept.append(c);
                    break;
            }
        }
        return kept.toString();
    }

    /**
     * Tells whether a title is free of the characters this class rejects in titles.
     *
     * @param title title to check; must not be {@code null}
     * @return {@code false} if the title contains any of {@code # < > [ ] | { }},
     *         {@code true} otherwise
     */
    public static boolean isCorrectTitle(String title) {
        final String forbidden = "#<>[]|{}";
        for (int i = 0; i < forbidden.length(); i++) {
            if (title.indexOf(forbidden.charAt(i)) >= 0) {
                return false;
            }
        }
        return true;
    }

    /**
     * Decodes percent-encoded runs and HTML entities in a string.
     *
     * <p>Each run matching {@code %[%a-zA-Z0-9]+} is URL-decoded as UTF-8. It is
     * substituted only when the decoded form is shorter than the encoded one. Runs
     * that fail to decode are left untouched. HTML entities are then unescaped
     * repeatedly until the string stops changing.
     *
     * <p>The substitution is literal. The earlier regex-based replacement threw
     * {@code IllegalArgumentException} when the decoded text contained {@code $} or
     * {@code \} (for example {@code %24} or {@code %5C}).
     *
     * @param str text to decode; must not be {@code null}
     * @return the decoded text
     */
    public static String decode(String str) {
        URLCodec codec = new URLCodec("UTF8");
        // The matcher keeps scanning the original input while str accumulates replacements.
        Matcher matcher = Pattern.compile("(%[%a-zA-Z0-9]+)").matcher(str);
        while (matcher.find()) {
            String encoded = matcher.group(1);
            String decoded;
            try {
                decoded = codec.decode(encoded);
            } catch (DecoderException ignored) {
                // Best effort: a run that is not valid percent-encoding stays as it is.
                continue;
            }
            if (decoded.length() < encoded.length()) {
                // Literal replace: decoded text must not be interpreted as a regex replacement.
                str = str.replace(encoded, decoded);
            }
        }

        // Unescape until a fixed point so nested entities such as "&amp;amp;" are fully resolved.
        String unescaped = StringEscapeUtils.unescapeHtml(str);
        while (!str.equals(unescaped)) {
            str = unescaped;
            unescaped = StringEscapeUtils.unescapeHtml(str);
        }
        return str;
    }

    /**
     * Decodes a section anchor in which {@code .} is used in place of {@code %}.
     *
     * <p>Every dot is turned into a percent sign and the result is passed to
     * {@link #decode(String)}.
     *
     * @param str anchor text; must not be {@code null}
     * @return the decoded anchor text
     */
    public static String decodeSectionLink(String str) {
        String percentEncoded = str.replace('.', '%');
        return WNormalizer.decode(percentEncoded);
    }

    /**
     * Parses a raw link target into a {@code WEntry}.
     *
     * <p>The target is cleaned with {@link #wTitleRule(String)} and
     * {@link #decode(String)}, split on {@code ':'} (empty tokens preserved), and the
     * positions of the name, namespace, language and project parts are located
     * before the entry is assembled by
     * {@link #createWEntry(String[], int, int, int, int)}.
     *
     * @param target raw link target
     * @return the parsed entry, or a {@code WEntry} built with the no-argument
     *         constructor when the cleaned target is blank
     */
    public WEntry normalize(String target) {
        String cleaned = WNormalizer.decode(WNormalizer.wTitleRule(target));
        if (StringUtils.isBlank(cleaned)) {
            return new WEntry();
        }

        String[] part = StringUtils.splitPreserveAllTokens(cleaned, ":");
        int nameIndex = this.getNameStartIndex(part);
        int nsIndex = this.getNSStartIndex(part, nameIndex);

        // Each later search is bounded by the largest index found so far.
        int limit = Math.max(nsIndex, nameIndex);
        int langIndex = this.getLangStartIndex(part, limit);
        limit = Math.max(langIndex, limit);
        int prIndex = this.getProjectStartIndex(part, limit);

        return this.createWEntry(part, prIndex, langIndex, nsIndex, nameIndex);
    }

    /**
     * Assembles a {@code WEntry} from the colon-separated parts of a title.
     *
     * <p>The name is {@code part[name..]} re-joined with {@code ':'}, trimmed and
     * capitalized. Project, language and namespace are the lower-cased, trimmed
     * parts at the given indexes, or the empty string when the index is {@code -1}.
     *
     * <p>An empty name range ({@code name >= part.length}) yields an empty name. A
     * namespace string missing from the namespace table keeps the default number
     * {@code 0}. Neither case throws any more.
     *
     * @param part title split on {@code ':'}
     * @param pr index of the project part, or {@code -1}
     * @param lang index of the language part, or {@code -1}
     * @param ns index of the namespace part, or {@code -1}
     * @param name index of the first part belonging to the name
     * @return the assembled entry
     */
    public WEntry createWEntry(String[] part, int pr, int lang, int ns, int name) {
        // Join with a separator between elements; an empty range no longer needs trimming.
        StringBuilder joined = new StringBuilder();
        for (int i = name; i < part.length; i++) {
            if (i > name) {
                joined.append(':');
            }
            joined.append(part[i]);
        }
        String n = StringUtils.capitalize(joined.toString().trim());

        String nsStr = "";
        int nsNumber = 0;
        if (ns != -1) {
            nsStr = part[ns].toLowerCase().trim();
            // Avoid unboxing null when the namespace string is not in the table.
            Integer known = this.nsNumberMap.get(nsStr);
            if (known != null) {
                nsNumber = known.intValue();
            }
        }

        String langStr = "";
        if (lang != -1) {
            langStr = part[lang].toLowerCase().trim();
        }

        String prStr = "";
        if (pr != -1) {
            prStr = part[pr].toLowerCase().trim();
        }
        return new WEntry(prStr, langStr, nsStr, n, nsNumber);
    }

    /**
     * Finds the index of the first part that belongs to the page name.
     *
     * <p>Parts are examined left to right after lower-casing and trimming. A part is
     * consumed as a prefix in one of three cases:
     * <ul>
     *   <li>it is a project prefix and no prefix has been consumed yet;</li>
     *   <li>it is a language prefix and neither a language nor a namespace has been
     *       consumed;</li>
     *   <li>it is a namespace name and no namespace has been consumed.</li>
     * </ul>
     * The first part that is not consumed marks the start of the name.
     *
     * @param part title split on {@code ':'}
     * @return index of the first name part, or {@code part.length - 1} when every
     *         part was consumed as a prefix
     */
    public int getNameStartIndex(String[] part) {
        int projectAt = -1;
        int langAt = -1;
        int nsAt = -1;
        for (int idx = 0; idx < part.length; idx++) {
            String token = part[idx].toLowerCase().trim();
            if (this.projectSet.contains(token) && projectAt == -1 && langAt == -1 && nsAt == -1) {
                projectAt = idx;
            } else if (this.langSet.contains(token) && langAt == -1 && nsAt == -1) {
                langAt = idx;
            } else if (this.nsNumberMap.containsKey(token) && nsAt == -1) {
                nsAt = idx;
            } else {
                // Reaching this point means the token is either in none of the three
                // collections or a prefix has already been consumed, so it starts the name.
                return idx;
            }
        }
        return part.length - 1;
    }

    /**
     * Finds the first part, before a limit, that names a known namespace.
     *
     * @param part title split on {@code ':'}
     * @param lim exclusive upper bound of the search; {@code -1} means the whole array
     * @return index of the first part whose lower-cased, trimmed text is a key of
     *         the namespace table, or {@code -1} if there is none
     */
    public int getNSStartIndex(String[] part, int lim) {
        int end = (lim == -1) ? part.length : lim;
        for (int idx = 0; idx < end; idx++) {
            String token = part[idx].toLowerCase().trim();
            if (this.nsNumberMap.containsKey(token)) {
                return idx;
            }
        }
        return -1;
    }

    /**
     * Finds the first part, before a limit, that is a known language prefix.
     *
     * @param part title split on {@code ':'}
     * @param lim exclusive upper bound of the search; {@code -1} means the whole array
     * @return index of the first part whose lower-cased, trimmed text is in the
     *         language set, or {@code -1} if there is none
     */
    public int getLangStartIndex(String[] part, int lim) {
        int end = (lim == -1) ? part.length : lim;
        for (int idx = 0; idx < end; idx++) {
            String token = part[idx].toLowerCase().trim();
            if (this.langSet.contains(token)) {
                return idx;
            }
        }
        return -1;
    }

    /**
     * Finds the first part, before a limit, that is a known project prefix.
     *
     * @param part title split on {@code ':'}
     * @param lim exclusive upper bound of the search; {@code -1} means the whole array
     * @return index of the first part whose lower-cased, trimmed text is in the
     *         project set, or {@code -1} if there is none
     */
    public int getProjectStartIndex(String[] part, int lim) {
        int end = (lim == -1) ? part.length : lim;
        for (int idx = 0; idx < end; idx++) {
            String token = part[idx].toLowerCase().trim();
            if (this.projectSet.contains(token)) {
                return idx;
            }
        }
        return -1;
    }

    /**
     * Tells whether a string is one of the known language prefixes.
     *
     * <p>The string is tested exactly as given; no lower-casing or trimming is
     * applied here.
     *
     * @param str candidate prefix
     * @return {@code true} if the language set contains {@code str}
     */
    public boolean isLangPrefix(String str) {
        boolean known = this.langSet.contains(str);
        return known;
    }

    /**
     * Tells whether a string is one of the known namespace names.
     *
     * <p>The string is tested exactly as given; no lower-casing or trimming is
     * applied here.
     *
     * @param str candidate namespace name
     * @return {@code true} if the namespace table has {@code str} as a key
     */
    public boolean isNamespacePrefix(String str) {
        boolean known = this.nsNumberMap.containsKey(str);
        return known;
    }

    /**
     * Tells whether a string is one of the known project prefixes.
     *
     * <p>The string is tested exactly as given; no lower-casing or trimming is
     * applied here.
     *
     * @param str candidate prefix
     * @return {@code true} if the project set contains {@code str}
     */
    public boolean isProjectPrefix(String str) {
        boolean known = this.projectSet.contains(str);
        return known;
    }

    /**
     * Returns an iterator over the project prefixes.
     *
     * <p>The iterator comes straight from the set held by this object.
     *
     * @return iterator over the project set
     */
    public Iterator<String> projectItr() {
        Set<String> projects = this.projectSet;
        return projects.iterator();
    }

    /**
     * Returns an iterator over the language prefixes.
     *
     * <p>The iterator comes straight from the set held by this object.
     *
     * @return iterator over the language set
     */
    public Iterator<String> langItr() {
        Set<String> languages = this.langSet;
        return languages.iterator();
    }

    /**
     * Returns an iterator over the namespace table.
     *
     * <p>The iterator comes straight from the entry set of the map held by this
     * object.
     *
     * @return iterator over (namespace name, namespace number) entries
     */
    public Iterator<Map.Entry<String, Integer>> nsNumberItr() {
        Set<Map.Entry<String, Integer>> entries = this.nsNumberMap.entrySet();
        return entries.iterator();
    }
}

