/*
 *	Qizx/Open version 0.3
 *
 *	Copyright (c) 2003-2004 Xavier C. FRANC -- All rights reserved.
 *
 *	This program is free software; you can redistribute it  and/or
 *	modify it under the terms of the GNU General Public License as
 *	published by the Free Software Foundation (see LICENSE.txt).
 */

package net.xfra.qizxopen.util;

/**
 *  A default word extractor suitable for European languages compatible with ISO-8859-1.
 *  <p>By default, a word starts on an ASCII letter, a digit, an underscore or any
 *  character whose code is 192 (0xC0) or above, and may additionally contain hyphens.
 *  A dot is also kept inside a word when it is followed by a non-whitespace character.
 *  Words of length 1 are always discarded.
 *  <p>Characters are folded to lowercase and - unless setKeepAccents(true) is called -
 *  accented letters are folded to the corresponding non-accented lowercase letter
 *  (e.g. eacute maps to 'e'). Letters that have no plain ASCII equivalent (eth, thorn,
 *  the ae / oe / ij ligatures, ezh...) are folded to their lowercase base form, so that
 *  the upper and lower case variants of a letter always yield the same character.
 *  <p>This behavior can be redefined in subclasses by redefining
 *  isWordStart, isWordPart and mapChar.
 */
public class DefaultWordExtractor implements WordExtractor
{
    /** First character code that is unconditionally treated as a letter. */
    private static final int FIRST_HIGH_LETTER = 192;
    /** Size of the folding table; characters at or beyond this code are left unchanged. */
    private static final int MAPPING_SIZE = 0x300;

    /** charType value: the character may appear inside a word but not start it. */
    private static final byte WORD_PART = 1;
    /** charType value: the character may start a word (and therefore also continue it). */
    private static final byte WORD_START = 2;

    /** Canonical (lowercase) targets for letters without an ASCII equivalent. */
    private static final char SMALL_AE = 0x00E6;
    private static final char SMALL_OE = 0x0153;
    private static final char SMALL_EZH = 0x0292;

    /** Text being scanned; only the first <code>length</code> characters are used. */
    char[]  text;
    /** Number of significant characters in <code>text</code>. */
    int     length;
    /** Current scan position in <code>text</code>. */
    int     ptr;
    /** Offset of the first character of the most recently scanned word. */
    int     wordStart;
    /** Folding table indexed by character code (see setKeepAccents). */
    char[]  mapping;
    /** Classification of characters below 192: 0 = separator, 1 = word part, 2 = word start. */
    byte[]  charType;

    /**
     *  Creates an extractor that folds accented letters,
     *  i.e. in the state obtained by calling setKeepAccents(false).
     */
    public DefaultWordExtractor( ) {
        setKeepAccents(false);
    }

    /**
     *  Begins the extraction of words from a new text.
     *  @param text characters to scan
     *  @param length number of characters of <code>text</code> to scan, from index 0
     */
    public void start(char[] text, int length) {
        this.text = text;
        this.length = length;
        ptr = 0;
        // Do not leave the offset of a word of the previous text behind.
        wordStart = 0;
    }

    /**
     *  Returns true if a word may begin with this character.
     */
    public boolean isWordStart(char c) {
        return c >= FIRST_HIGH_LETTER || charType[c] >= WORD_START;
    }

    /**
     *  Returns true if a word may contain this character.
     */
    public boolean isWordPart(char c) {
        return c >= FIRST_HIGH_LETTER || charType[c] >= WORD_PART;
    }

    /**
     *  Returns the folded form of a character: the entry of the folding table,
     *  or the character itself when it lies beyond the table.
     */
    public char mapChar(char c) {
        return c >= mapping.length ? c : mapping[c];
    }

    /**
     *  Scans forward for the next word of at least two characters.
     *  @return a new array holding the folded characters of the word (see mapChar),
     *  or null when the end of the text is reached. After a non-null return,
     *  wordOffset() and wordLength() describe the word in the original text.
     */
    public char[] nextWord() {
        // A while loop (rather than a for loop with an increment) guarantees that
        // the character following a discarded one-letter word is examined too,
        // and that ptr never runs past length.
        while (ptr < length) {
            if (!isWordStart(text[ptr])) {
                ptr++;
                continue;
            }
            wordStart = ptr;
            int end = ptr + 1;
            while (end < length && continuesWord(end)) {
                end++;
            }
            ptr = end;

            int size = end - wordStart;
            if (size > 1) {     // always remove words of length 1
                char[] word = new char[size];
                for (int i = 0; i < size; i++) {
                    word[i] = mapChar(text[wordStart + i]);
                }
                return word;
            }
        }
        return null;
    }

    /**
     *  Tells whether the character at <code>pos</code> extends the current word:
     *  either it is a word part, or it is a dot not followed by whitespace
     *  (and not the last character of the text).
     */
    private boolean continuesWord(int pos) {
        char c = text[pos];
        if (isWordPart(c)) {
            return true;
        }
        return c == '.' && pos < length - 1 && !Character.isWhitespace(text[pos + 1]);
    }

    /**
     *  Returns the character located <code>ahead</code> positions after the
     *  current scan position, or 0 if that position is outside of the text.
     */
    public char charAt(int ahead) {
        int pos = ptr + ahead;
        return (pos < 0 || pos >= length) ? 0 : text[pos];
    }

    /**
     *  Advances the scan position by one character and returns the character
     *  found there, or 0 if the end of the text is reached.
     */
    public char nextChar() {
        if (ptr >= length) {
            return 0;
        }
        ++ptr;
        return charAt(0);
    }

    /**
     *  Returns the offset in the text of the last scanned word.
     */
    public int wordOffset() {
        return wordStart;
    }

    /**
     *  Returns the distance between the start of the last scanned word and the
     *  current scan position: right after nextWord(), the length of the word.
     */
    public int wordLength() {
        return ptr - wordStart;
    }

    /** Folds every character code of the inclusive range [c1, c2] to <code>c</code>. */
    private final void mapRange(int c1, int c2, char c) {
        for ( ; c1 <= c2; c1++) {
            mapping[c1] = c;
        }
    }

    /**
     *  Rebuilds the character classification and folding tables.
     *  @param keep if true, characters are only folded to lowercase; if false,
     *  accented and otherwise decorated Latin letters are in addition folded
     *  to their plain lowercase letter.
     */
    public void setKeepAccents(boolean keep) {
        initCharTypes();
        initLowerCaseMapping();
        if (keep) {
            return;
        }
        foldLatin1();
        foldLatinExtendedA();
        foldLatinExtendedB();
        foldIpaExtensions();
    }

    /** Classifies characters below 192: letters, digits and '_' start words, '-' continues them. */
    private void initCharTypes() {
        charType = new byte[FIRST_HIGH_LETTER];
        for (int c = '0'; c <= '9'; c++) {
            charType[c] = WORD_START;
        }
        for (int c = 'A'; c <= 'Z'; c++) {
            charType[c] = WORD_START;
            charType[c + ('a' - 'A')] = WORD_START;
        }
        charType['_'] = WORD_START;
        charType['-'] = WORD_PART;
    }

    /** Initializes the folding table so that each character maps to its lowercase form. */
    private void initLowerCaseMapping() {
        mapping = new char[MAPPING_SIZE];
        for (int c = 0; c < mapping.length; c++) {
            mapping[c] = Character.toLowerCase((char) c);
        }
    }

    /**
     *  Folds accented letters of the Latin-1 Supplement (U+00C0 - U+00FF).
     *  ETH / eth, THORN / thorn and sharp s keep the lowercase form set by
     *  initLowerCaseMapping, so that both cases of a letter fold to the same character.
     */
    private void foldLatin1() {
        mapRange(0x00C0, 0x00C5, 'a');  // LATIN CAPITAL LETTER A WITH GRAVE .. RING ABOVE
        mapping[0x00C6] = SMALL_AE;     // LATIN CAPITAL LETTER AE
        mapping[0x00C7] = 'c';          // LATIN CAPITAL LETTER C WITH CEDILLA
        mapRange(0x00C8, 0x00CB, 'e');
        mapRange(0x00CC, 0x00CF, 'i');
        mapping[0x00D1] = 'n';          // LATIN CAPITAL LETTER N WITH TILDE
        mapRange(0x00D2, 0x00D6, 'o');
        // U+00D7 is MULTIPLICATION SIGN, not a letter: left untouched.
        mapping[0x00D8] = 'o';          // LATIN CAPITAL LETTER O WITH STROKE
        mapRange(0x00D9, 0x00DC, 'u');
        mapping[0x00DD] = 'y';          // LATIN CAPITAL LETTER Y WITH ACUTE

        mapRange(0x00E0, 0x00E5, 'a');
        mapping[0x00E7] = 'c';          // LATIN SMALL LETTER C WITH CEDILLA
        mapRange(0x00E8, 0x00EB, 'e');
        mapRange(0x00EC, 0x00EF, 'i');
        mapping[0x00F1] = 'n';          // LATIN SMALL LETTER N WITH TILDE
        mapRange(0x00F2, 0x00F6, 'o');
        // U+00F7 is DIVISION SIGN, not a letter: left untouched.
        mapping[0x00F8] = 'o';          // LATIN SMALL LETTER O WITH STROKE
        mapRange(0x00F9, 0x00FC, 'u');
        mapping[0x00FD] = 'y';          // LATIN SMALL LETTER Y WITH ACUTE
        mapping[0x00FF] = 'y';          // LATIN SMALL LETTER Y WITH DIAERESIS
    }

    /**
     *  Folds decorated letters of Latin Extended-A (U+0100 - U+017F).
     *  The IJ and OE ligatures keep their lowercase form.
     */
    private void foldLatinExtendedA() {
        mapRange(0x0100, 0x0105, 'a');  // MACRON BREVE OGONEK
        mapRange(0x0106, 0x010D, 'c');  // ACUTE, CIRCUMFLEX, DOT ABOVE, CARON
        mapRange(0x010E, 0x0111, 'd');  // CARON, STROKE
        mapRange(0x0112, 0x011B, 'e');
        mapRange(0x011C, 0x0123, 'g');  // CIRCUMFLEX BREVE DOT ABOVE CEDILLA
        mapRange(0x0124, 0x0127, 'h');
        mapRange(0x0128, 0x0131, 'i');
        mapRange(0x0134, 0x0135, 'j');
        mapRange(0x0136, 0x0137, 'k');
        mapRange(0x0139, 0x0142, 'l');
        mapRange(0x0143, 0x0149, 'n');
        mapRange(0x014C, 0x0151, 'o');
        mapRange(0x0154, 0x0159, 'r');
        mapRange(0x015A, 0x0161, 's');
        mapRange(0x0162, 0x0167, 't');
        mapRange(0x0168, 0x0173, 'u');
        mapRange(0x0174, 0x0175, 'w');
        mapRange(0x0176, 0x0178, 'y');
        mapRange(0x0179, 0x017E, 'z');
        mapping[0x017F] = 's';          // LATIN SMALL LETTER LONG S
    }

    /**
     *  Folds decorated letters of Latin Extended-B (U+0180 - U+0217).
     *  OI and DZ keep their lowercase form; ezh and ae variants fold to
     *  the plain lowercase ezh and ae.
     */
    private void foldLatinExtendedB() {
        mapRange(0x0180, 0x0183, 'b');
        mapping[0x0186] = 'o';          // LATIN CAPITAL LETTER OPEN O
        mapRange(0x0187, 0x0188, 'c');
        mapRange(0x0189, 0x018C, 'd');
        mapping[0x018E] = 'e';          // LATIN CAPITAL LETTER REVERSED E
        mapping[0x0190] = 'e';          // LATIN CAPITAL LETTER OPEN E
        mapRange(0x0191, 0x0192, 'f');  // F WITH HOOK, capital and small
        mapping[0x0193] = 'g';          // LATIN CAPITAL LETTER G WITH HOOK
        mapping[0x0197] = 'i';          // LATIN CAPITAL LETTER I WITH STROKE
        mapRange(0x0198, 0x0199, 'k');  // K WITH HOOK, capital and small
        mapping[0x019A] = 'l';          // LATIN SMALL LETTER L WITH BAR
        mapping[0x019C] = 'm';          // LATIN CAPITAL LETTER TURNED M
        mapRange(0x019D, 0x019E, 'n');  // N WITH LEFT HOOK, N WITH LONG RIGHT LEG
        mapRange(0x019F, 0x01A1, 'o');  // O WITH MIDDLE TILDE, O WITH HORN
        mapRange(0x01A4, 0x01A5, 'p');  // P WITH HOOK, capital and small
        mapRange(0x01AB, 0x01AE, 't');  // T WITH PALATAL / plain / RETROFLEX HOOK
        mapRange(0x01AF, 0x01B0, 'u');  // U WITH HORN, capital and small
        mapping[0x01B2] = 'v';          // LATIN CAPITAL LETTER V WITH HOOK
        mapRange(0x01B3, 0x01B4, 'y');  // Y WITH HOOK, capital and small
        mapRange(0x01B5, 0x01B6, 'z');  // Z WITH STROKE, capital and small
        mapRange(0x01B8, 0x01BA, SMALL_EZH);    // EZH REVERSED, EZH WITH TAIL

        mapRange(0x01CD, 0x01CE, 'a');  // A WITH CARON
        mapRange(0x01CF, 0x01D0, 'i');  // I WITH CARON
        mapRange(0x01D1, 0x01D2, 'o');  // O WITH CARON
        mapRange(0x01D3, 0x01DC, 'u');
        mapping[0x01DD] = 'e';          // LATIN SMALL LETTER TURNED E
        mapRange(0x01DE, 0x01E1, 'a');  // A WITH DIAERESIS / DOT ABOVE AND MACRON
        mapRange(0x01E2, 0x01E3, SMALL_AE);     // AE WITH MACRON
        mapRange(0x01E4, 0x01E7, 'g');  // G WITH STROKE, G WITH CARON
        mapRange(0x01E8, 0x01E9, 'k');  // K WITH CARON
        mapRange(0x01EA, 0x01ED, 'o');  // O WITH OGONEK (AND MACRON)
        mapRange(0x01EE, 0x01EF, SMALL_EZH);    // EZH WITH CARON
        mapping[0x01F0] = 'j';          // LATIN SMALL LETTER J WITH CARON
        mapRange(0x01F4, 0x01F5, 'g');  // G WITH ACUTE
        mapRange(0x01FA, 0x01FB, 'a');  // A WITH RING ABOVE AND ACUTE
        mapRange(0x01FC, 0x01FD, SMALL_AE);     // AE WITH ACUTE
        mapRange(0x01FE, 0x01FF, 'o');  // O WITH STROKE AND ACUTE

        // DOUBLE GRAVE and INVERTED BREVE, capital and small.
        mapRange(0x0200, 0x0203, 'a');
        mapRange(0x0204, 0x0207, 'e');
        mapRange(0x0208, 0x020B, 'i');
        mapRange(0x020C, 0x020F, 'o');
        mapRange(0x0210, 0x0213, 'r');
        mapRange(0x0214, 0x0217, 'u');
    }

    /** Folds letters of the IPA Extensions block (U+0250 - U+029C) to their base letter. */
    private void foldIpaExtensions() {
        mapping[0x0250] = 'a';          // LATIN SMALL LETTER TURNED A
        mapping[0x0253] = 'b';          // LATIN SMALL LETTER B WITH HOOK
        mapping[0x0254] = 'o';          // LATIN SMALL LETTER OPEN O
        mapping[0x0255] = 'c';          // LATIN SMALL LETTER C WITH CURL
        mapRange(0x0256, 0x0257, 'd');  // D WITH TAIL, D WITH HOOK
        mapping[0x0258] = 'e';          // LATIN SMALL LETTER REVERSED E
        mapRange(0x025B, 0x025E, 'e');  // OPEN E and REVERSED OPEN E variants
        mapping[0x025F] = 'j';          // LATIN SMALL LETTER DOTLESS J WITH STROKE
        mapRange(0x0260, 0x0262, 'g');  // G WITH HOOK, SCRIPT G, SMALL CAPITAL G
        mapRange(0x0265, 0x0266, 'h');  // TURNED H, H WITH HOOK
        mapping[0x0268] = 'i';          // LATIN SMALL LETTER I WITH STROKE
        mapping[0x026A] = 'i';          // LATIN LETTER SMALL CAPITAL I
        mapRange(0x026B, 0x026D, 'l');  // L WITH MIDDLE TILDE / BELT / RETROFLEX HOOK
        mapRange(0x026F, 0x0271, 'm');  // TURNED M (WITH LONG LEG), M WITH HOOK
        mapRange(0x0272, 0x0274, 'n');  // N WITH LEFT / RETROFLEX HOOK, SMALL CAPITAL N
        mapping[0x0275] = 'o';          // LATIN SMALL LETTER BARRED O
        mapping[0x0276] = SMALL_OE;     // LATIN LETTER SMALL CAPITAL OE
        mapRange(0x0279, 0x0281, 'r');  // TURNED R .. SMALL CAPITAL INVERTED R
        mapping[0x0282] = 's';          // LATIN SMALL LETTER S WITH HOOK
        mapping[0x0284] = 'j';          // LATIN SMALL LETTER DOTLESS J WITH STROKE AND HOOK
        mapRange(0x0287, 0x0288, 't');  // TURNED T, T WITH RETROFLEX HOOK
        mapping[0x0289] = 'u';          // LATIN SMALL LETTER U BAR
        mapRange(0x028B, 0x028C, 'v');  // V WITH HOOK, TURNED V
        mapping[0x028D] = 'w';          // LATIN SMALL LETTER TURNED W
        mapping[0x028E] = 'y';          // LATIN SMALL LETTER TURNED Y
        mapping[0x028F] = 'y';          // LATIN LETTER SMALL CAPITAL Y
        mapRange(0x0290, 0x0291, 'z');  // Z WITH RETROFLEX HOOK, Z WITH CURL
        mapping[0x0293] = SMALL_EZH;    // LATIN SMALL LETTER EZH WITH CURL
        mapping[0x0297] = 'c';          // LATIN LETTER STRETCHED C
        mapping[0x0299] = 'b';          // LATIN LETTER SMALL CAPITAL B
        mapping[0x029A] = 'e';          // LATIN SMALL LETTER CLOSED OPEN E
        mapping[0x029B] = 'g';          // LATIN LETTER SMALL CAPITAL G WITH HOOK
        mapping[0x029C] = 'h';          // LATIN LETTER SMALL CAPITAL H
    }

    /**
     *  Command-line test: prints, one per line, the words extracted from each
     *  argument. The option -a switches to accent-keeping mode for the
     *  arguments that follow it.
     */
    static public void main( String args[] )
    {
        DefaultWordExtractor ex = new DefaultWordExtractor();

        try {
            for (int a = 0; a < args.length; a++) {
                if (args[a].equals("-a")) {
                    ex.setKeepAccents(true);
                    continue;
                }
                char[] arg = args[a].toCharArray();
                ex.start(arg, arg.length);
                for (char[] word = ex.nextWord(); word != null; word = ex.nextWord()) {
                    System.out.println(new String(word));
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
} // end of class DefaultWordExtractor
