package org.coderepos.nori090.sen;

import java.io.IOException;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

import net.java.sen.SenFactory;
import net.java.sen.StringTagger;
import net.java.sen.dictionary.Token;

import com.enigmastation.classifier.WordLister;

/**
 * Simple word lister for a Bayesian filter.
 * <p>
 * Tokenizes a document with a Sen {@link StringTagger} and reports the distinct
 * surface strings of the resulting tokens.
 * 
 * @author nori090
 * @version $Rev: 180 $ $Date: 2008-09-27 11:55:41 +0900 (Sat, 27 Sep 2008) $
 */
public class GosenWordLister
    implements WordLister {
    /** Tagger used to tokenize documents; obtained from {@link SenFactory} in the constructor. */
    protected StringTagger tagger;

    /** Configuration this lister was created with. */
    protected GosenConfig config;

    /**
     * Creates a word lister whose tagger is obtained from {@link SenFactory} using the
     * dictionary path reported by the given configuration.
     * 
     * @param config configuration supplying the dictionary path; must not be {@code null}
     * @throws NullPointerException if {@code config} is {@code null}
     */
    public GosenWordLister( GosenConfig config ) {
        // Fail fast with a descriptive message instead of an anonymous NPE from the call below.
        if ( config == null ) {
            throw new NullPointerException( "config must not be null" );
        }
        this.config = config;
        tagger = SenFactory.getStringTagger( config.getDictionaryPath() );
    }

    /**
     * Returns the distinct token surfaces found in the given document.
     * <p>
     * The document is converted to text with {@code toString()} and passed to the tagger; the
     * surface of every returned token is collected into a set, so duplicates collapse.
     * 
     * @param document the document to tokenize; {@code null} is treated as a document without words
     * @return a new, modifiable set of the token surfaces (empty when {@code document} is {@code null})
     * @throws RuntimeException wrapping the {@link IOException} if the tagger fails to analyze the text
     */
    @Override
    public Set<String> getUniqueWords( Object document ) {
        // A missing document has no words; return a fresh mutable set, matching the normal path.
        if ( document == null ) {
            return new HashSet<String>();
        }
        try {
            List<Token> tokens = tagger.analyze( document.toString() );
            // The token count is an upper bound on the number of distinct surfaces.
            Set<String> words = new HashSet<String>( tokens.size() );
            for ( Token token : tokens ) {
                words.add( token.getSurface() );
            }
            return words;
        }
        catch ( IOException ex ) {
            // WordLister's method declares no checked exceptions, so rethrow unchecked with the cause attached.
            throw new RuntimeException( ex.getMessage(), ex );
        }
    }
}
