package org.coderepos.nori090.sen;

import java.util.ArrayList;
import java.util.List;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import net.java.sen.dictionary.Sentence;
import net.java.sen.dictionary.Token;
import net.java.sen.filter.StreamFilter;

/**
 * Word lister that applies some normalisation to "dat file"-style posts before handing the text to GoSen.
 * <p>
 * Besides the text normalisation done in {@link #getUniqueWords(Object)}, the constructor registers a
 * {@link WordFilter} that re-joins URL fragments which were split into several tokens.
 * 
 * @author nori090
 * @version $Rev: 180 $ $Date: 2008-09-27 11:55:41 +0900 (Sat, 27 Sep 2008) $
 */
public class DatWordLister
    extends GosenWordLister {

    /**
     * Creates the lister and registers a {@link WordFilter} on the tagger.
     * 
     * @param config configuration passed through to the superclass constructor
     */
    public DatWordLister( GosenConfig config ) {
        super( config );
        // NOTE(review): `tagger` is not declared in this file; presumably inherited from GosenWordLister.
        tagger.addFilter( new WordFilter() );
    }

    /**
     * Normalises the textual form of {@code document} and delegates to the superclass.
     * <p>
     * The normalisation replaces {@code <br>} markers with CRLF, converts full-width parentheses to ASCII
     * parentheses, removes the literal text {@code &gt;} and converts full-width Latin letters to ASCII.
     * 
     * @param document the document; its {@code toString()} value is what gets analysed
     * @return whatever the superclass returns for the normalised text
     */
    @Override
    public Set<String> getUniqueWords( Object document ) {
        String doc = document.toString();
        // All substitutions are plain literals, so String.replace is used instead of the regex-based replaceAll.
        // "<br> " (with trailing space) must be handled before the bare "<br>" so the space is swallowed too.
        doc = doc.replace( "<br> ", "\r\n" );
        doc = doc.replace( "<br>", "\r\n" );
        doc = doc.replace( "（", "(" );
        doc = doc.replace( "）", ")" );
        doc = doc.replace( "&gt;", "" );
        return super.getUniqueWords( replace2byteAlphabet( doc ) );
    }

    /**
     * Converts full-width Latin letters ({@code ａ-ｚ}, {@code Ａ-Ｚ}) to their ASCII counterparts.
     * All other characters are left untouched.
     * 
     * @param s the text to convert
     * @return a copy of {@code s} with full-width Latin letters replaced
     */
    String replace2byteAlphabet( String s ) {
        StringBuilder sb = new StringBuilder( s );
        for ( int i = 0; i < sb.length(); i++ ) {
            char c = sb.charAt( i );
            if ( c >= 'ａ' && c <= 'ｚ' ) {
                sb.setCharAt( i, (char) ( c - 'ａ' + 'a' ) );
            }
            else if ( c >= 'Ａ' && c <= 'Ｚ' ) {
                sb.setCharAt( i, (char) ( c - 'Ａ' + 'A' ) );
            }
        }
        return sb.toString();
    }

    /**
     * Stream filter that merges the tokens of an http URL back into a single token after tokenisation.
     */
    class WordFilter
        implements StreamFilter {

        /**
         * Copies {@code tokens} into a new list, replacing every run of tokens that forms an http URL with one
         * combined token.
         * 
         * @param tokens the tokens to post-process
         * @return a new list; the input list itself is not modified
         */
        @Override
        public List<Token> postProcess( List<Token> tokens ) {
            List<Token> list = new ArrayList<Token>( tokens.size() );
            for ( int index = 0; index < tokens.size(); index++ ) {
                Token t = tokens.get( index );
                if ( isUrlStart( t ) ) {
                    AppendToken appendToken = appendToken4HttpUrl( tokens, index );
                    // Minus one because the loop increment moves index on to nextIndex.
                    index = appendToken.nextIndex - 1;
                    list.add( appendToken.token );
                    continue;
                }
                // TODO: emoticon (kaomoji) merging is not implemented yet; see isKaomojiStart and
                // appendToken4Kaomoji.
                list.add( t );
            }
            return list;
        }

        /**
         * Placeholder for merging the tokens of an emoticon (kaomoji). Not implemented and currently unused.
         * 
         * @param tokens the token list
         * @param index index of the token that starts the emoticon
         * @return always {@code null} for now
         */
        private AppendToken appendToken4Kaomoji( List<Token> tokens, int index ) {
            return null;
        }

        /**
         * Tells whether the token could start an emoticon, i.e. whether it is a left parenthesis {@code "("}.
         * Currently unused.
         * 
         * @param t the token to inspect
         * @return {@code true} if the token's surface is exactly {@code "("}
         */
        private boolean isKaomojiStart( Token t ) {
            return "(".equals( t.getSurface() );
        }

        /** Characters accepted as part of a URL after the {@code "://"} separator. */
        final Pattern url = Pattern.compile( "[-_.!~*\\'()a-zA-Z0-9;\\/?:\\@&=+\\$,%#]+" );

        /**
         * Re-joins tokens (strings) that were split apart into a single http URL token.
         * <p>
         * A leading {@code "ttp"} is completed to {@code "http"}. If the token after the scheme is
         * {@code "://"}, it and all following tokens containing URL characters are appended. Otherwise only
         * the scheme token is consumed, so the token after it is still handled by the caller.
         * 
         * @param tokens the full token list
         * @param index index of the token that starts the URL (see {@link #isUrlStart(Token)})
         * @return the combined token together with the index of the first token that was not consumed
         */
        private AppendToken appendToken4HttpUrl( List<Token> tokens, int index ) {
            String surface = tokens.get( index ).getSurface();
            index++;
            StringBuilder sb = new StringBuilder();
            if ( surface.startsWith( "ttp" ) ) {
                sb.append( "h" );
            }
            sb.append( surface );

            // Bounds check: the scheme token may be the very last token of the sentence.
            boolean hasSeparator = index < tokens.size() && "://".equals( tokens.get( index ).getSurface() );
            if ( hasSeparator ) {
                sb.append( "://" );
                index++;
                for ( ; index < tokens.size(); index++ ) {
                    surface = tokens.get( index ).getSurface();
                    // NOTE(review): find() accepts a token as soon as it contains one URL character;
                    // matches() may have been intended - confirm before tightening.
                    Matcher m = url.matcher( surface );
                    if ( !m.find() ) {
                        break;
                    }
                    sb.append( surface );
                }
            }
            // When there is no separator, index still points at the token following the scheme, so that
            // token is not dropped from the output.
            Token t = new Token();
            t.setSurface( sb.toString() );
            return new AppendToken( t, index );
        }

        /**
         * Tells whether the token's surface starts with {@code "http"} or {@code "ttp"}.
         * 
         * @param t the token to inspect
         * @return {@code true} if the token may start a URL
         */
        boolean isUrlStart( Token t ) {
            String surface = t.getSurface();
            return surface.startsWith( "http" ) || surface.startsWith( "ttp" );
        }

        /**
         * Result of merging tokens: the combined token plus the index at which the caller should resume.
         */
        class AppendToken {
            /** The combined token. */
            final Token token;

            /** Index of the first token that was not consumed by the merge. */
            final int nextIndex;

            private AppendToken( Token token, int nextIndex ) {
                this.token = token;
                this.nextIndex = nextIndex;
            }
        }

        /**
         * No pre-processing is performed.
         * 
         * @param sentence the sentence about to be analysed (ignored)
         */
        @Override
        public void preProcess( Sentence sentence ) {
        }
    }
}
