package com.limegroup.gnutella.util;

import com.limegroup.gnutella.Assert;

/**
 * An approximate string matcher.  Two strings are considered "approximately
 * equal" if one can be transformed into the other through some series of
 * inserts, deletes, and substitutions.<p>
 *
 * The approximate matcher has options to ignore case and whitespace.  It also
 * has switches to make it perform better by comparing strings backwards and
 * reusing a buffer.  However, these do <i>not</i> affect the match methods
 * directly; they only affect the results of the process(String) method. 
 * This method is used to preprocess strings before passing to match(..).
 * Typical use:
 *
 * <pre>
 *       String s1, s2;
 *       ApproximateMatcher matcher=new ApproximateMatcher();
 *       matcher.setIgnoreCase(true);
 *       matcher.setCompareBackwards(true);
 *       String s1p=matcher.process(s1);         //pre-process s1
 *       String s2p=matcher.process(s2);         //pre-process s2
 *       int matches=matcher.match(s1p, s2p);    //compare processed strings
 *       ...
 * </pre>
 *
 * The reason for this design is to reduce the pre-processing overhead when a
 * string is matched against many other strings.  Preprocessing really is
 * required to support the ignoreWhitespace option; it is simply not possible to
 * do the k-difference dynamic programming algorithm efficiently in one pass.
 * 
 * Note that this class is not thread-safe if the buffering constructor is
 * used.  
 */
final public class ApproximateMatcher
{
    /** True iff process(..) lower-cases its input. */
    private boolean ignoreCase=false;
    /** True iff process(..) drops the characters ' ' and '_'. */
    private boolean ignoreWhitespace=false;
    /** True iff process(..) reverses its input. */
    private boolean compareBackwards=false;
    
    /** For avoiding allocations.  This can only be used by one thread at a
     *  time.  INVARIANT: buffer!=null => buffer is a bufSize by bufSize array.
     *  Cell [0][0] is never written by the matching code, so it keeps the
     *  value 0 it was allocated with.
     */
    private volatile int[][] buffer;
    private volatile int bufSize;
    
    /**
     * Creates a new approximate matcher that respects case and whitespace, and
     * compares forwards.  No matrix is preallocated, so every call that
     * reaches the dynamic programming step allocates its own.  Compared to
     * ApproximateMatcher(int), this constructor is useful if the matcher is
     * used infrequently and memory is at a premium.
     */
    public ApproximateMatcher() {
        this.buffer=null;
    }

    /**
     * Like ApproximateMatcher() except that the new matcher can compare strings
     * of the given size without any significant allocations.  This is a useful
     * optimization if you need to make many comparisons with one matcher.  The
     * matcher will still be able to compare larger strings, but it will require
     * an allocation.  The buffer is not released until this is garbage
     * collected.  <b>This method breaks thread safety; only one match(..)
     * call can be done at a time with a matcher created by this constructor.
     * </b>
     *
     * @param size the longest string length (in characters) that should be
     *  matched without allocating a new matrix
     */
    public ApproximateMatcher(int size) {
        bufSize=size+1;
        buffer=new int[bufSize][bufSize]; //need "margins" of 1 on each side
    }
    

    ////////////////////////////// Processing Methods ///////////////////////

    /**
     * @param ignoreCase true iff case should be ignored when matching processed
     * strings.  Default value is false.
     */
    public void setIgnoreCase(boolean ignoreCase) {
        this.ignoreCase=ignoreCase;
    }

    /**
     * @param ignoreWhitespace true iff the characters ' ' and '_' should be
     * ignored when matching processed strings.  Default value is false.
     */
    public void setIgnoreWhitespace(boolean ignoreWhitespace) {
        this.ignoreWhitespace=ignoreWhitespace;
    }

    /**
     * @param compareBackwards true iff the comparison should be done backwards
     * when matching processed strings.  This is solely an optimization if you
     * expect more differences at the end of the word than the beginning.  
     * Default value is false.
     */
    public void setCompareBackwards(boolean compareBackwards) {
        this.compareBackwards=compareBackwards;
    }
    
    /** 
     * Returns a version of s suitable for passing to match(..).  This
     * means that s could be stripped of whitespace, lower-cased, or reversed
     * depending on the calls to setIgnoreCase, setIgnoreWhitespace, and
     * setCompareBackwards.  The returned value may be == to s.
     *
     * @param s the string to preprocess; must not be null
     * @return s itself if no option is set, otherwise a new string with the
     *  enabled transformations applied
     */
    public String process(String s) {
        //Optimize for special case.
        if (! (ignoreCase || compareBackwards || ignoreWhitespace))
            return s;

        final int len=s.length();
        final boolean backwards=compareBackwards;
        StringBuffer buf=new StringBuffer(len);
        for (int i=0; i<len; i++) {
            //Walk from the end when comparing backwards, else from the start.
            char c=s.charAt(backwards ? len-i-1 : i);
            if (ignoreCase)
                c=Character.toLowerCase(c);
            if (ignoreWhitespace && (c==' ' || c=='_'))
                continue;
            buf.append(c);
        }
        return buf.toString();
    }


    ///////////////////////// Public Matching Methods //////////////////////////

    /**
     * Returns the edit distance between s1 and s2.  That is, returns the number
     * of insertions, deletions, or replacements necessary to transform s1 into
     * s2.  A value of 0 means the strings match exactly.<p>
     *
     * If you want to ignore case or whitespace, or compare backwards, s1 and s2
     * should be the return values of a call to process(..).
     */
    public final int match(String s1, String s2) {
        //Let m=s1.length(), n=s2.length(), and k be the edit difference between
        //s1 and s2.  It's possible to reduce the time from O(mn) time to O(kn)
        //time by repeated iterations of the the k-difference algorithm.  But
        //this is a bit complicated.
        return matchInternal(s1, s2, Integer.MAX_VALUE);
    }

    /**
     * Returns true if the edit distance between s1 and s2 is less than or equal
     * to maxOps.  That is, returns true if s1 can be transformed into s2
     * through no more than maxOps insertions, deletions, or replacements.  This
     * method is generally more efficient than match(..) if you only care
     * whether two strings approximately match.<p>
     *
     * If you want to ignore case or whitespace, or compare backwards, s1 and s2
     * should be the return values of a call to process(..).
     */
    public final boolean matches(String s1, String s2, int maxOps) {
        return matchInternal(s1, s2, maxOps)<=maxOps;
    }

    /** 
     * Returns true if s1 can be transformed into s2 without changing more than
     * the given fraction of s1's letters.  For example, matches(1.) is the same
     * as an exact comparison, while matches(0.) always returns true as long as
     * |s1|>=|s2|.  matches(0.9) means "s1 and s2 match pretty darn closely".<p>
     *
     * If you want to ignore case or whitespace, or compare backwards, s1 and s2
     * should be the return values of a call to process(..).
     * 
     * @requires 0.<=precision<=1.
     */
    public final boolean matches(String s1, String s2, float precision) {
        int s1n=s1.length();
        int n=(int)(precision*((float)s1n));  //number UNchanged
        int maxOps=s1n-n;                     //number changed
        return matches(s1, s2, maxOps);
    }
        

    /**
     * If the edit distance between s1 and s2 is less than or equal to maxOps,
     * returns the edit distance.  Otherwise returns some number greater than
     * maxOps.
     */    
    private int matchInternal(String s1, String s2, int maxOps) {
        //Swap if necessary to ensure |s1|<=|s2|.
        if (s1.length()<=s2.length()) 
            return matchInternalProcessed(s1, s2, maxOps);
        else 
            return matchInternalProcessed(s2, s1, maxOps);
    }


    ///////////////////////////// Core algorithm //////////////////////////


    /**
     * Same as matchInternal, but with a stronger precondition.
     *     @requires |s1|<=|s2|
     */
    private int matchInternalProcessed(
            String s1, String s2, final int maxOps) {
        //A classic implementation using dynamic programming.  d[i,j] is the
        //edit distance between s1[0..i-1] and s2[0..j-1] and is defined
        //recursively.  Note that there are "margins" of 1 on the left and
        //top of this matrix.  See Chapter 11 of _Algorithms on Strings, Trees,
        //and Sequences_ by Dan Gusfield for a complete discussion.
        //
        //A key optimization is that we only fill in part of the row.  This is
        //based on the observation that any maxOps-difference global alignment
        //must not contain any cell (i, i+l) or (i,i-l), where l>maxOps.  Cells
        //outside this band are never filled in, so they must never be read.
        //
        //There are two additional twists to the usual algorithm.  First, we fill in
        //the matrix anti-diagonally instead of one row at a time.  Secondly, we
        //stop if the minimum value of the last two diagonals is greater than
        //maxOps.
        final int s1n=s1.length();
        final int s2n=s2.length();
        Assert.that(s1n<=s2n);
        
        //With no edits allowed, the strings either are equal or are not.
        if (maxOps<=0)
            return (s1.equals(s2)) ? 0 : 1;
        //Strings of vastly differing lengths don't match.  This is necessary to
        //prevent the last return statement below from incorrectly returning
        //zero.
        else if (Math.abs(s1n-s2n) > maxOps) {
            return maxOps+1;
        }
        //If one of the strings is empty, the distance is trivial to calculate.
        else if (s1n==0) { //s2n==0 ==> s1n==0           
            return s2n;
        }
        
        //Optimization: recycle buffer for matrix if possible. 
        int[][] d;
        if (buffer!=null
                && (bufSize >= Math.max(s1n+1, s2n+1)))
            d=buffer; 
        else 
            d=new int[s1n+1][s2n+1];               //Note d[0][0]==0
        int diagonals=2*Math.min(s1n+1, s2n+1)-1
                         +Math.min(s2n-s1n, maxOps);
        int minThisDiag;              //The min value of this diagonal
        int minLastDiag=0;            //The min value of last diagonal
        
        //For each k'th anti-diagonal except first (measured from the origin)...
        for (int k=1; k<diagonals; k++) {            
            //1. Calculate indices of left corner of diagonal (i1, j1) and upper
            //right corner (i2, j2).  This is black magic.  You really need to
            //look at a diagram to see why it works.
            int i1=k/2+maxOps/2;
            int j1=k/2-maxOps/2;
            int i2=k/2-maxOps/2;
            int j2=k/2+maxOps/2;            
            if ((k%2)!=0) {              //odd k?
                if ((maxOps%2)==0) {     //even maxOps?
                    //out and away from last endpoint
                    j1++;
                    i2++;
                } else {
                    //in towards the diagonal
                    i1++;
                    j2++;
                }
            }           
            //If endpoints don't fall on board, adjust accordingly
            if (j1<0 || i1>s1n) {
                i1=Math.min(k, s1n);
                j1=k-i1;
            }
            if (i2<0 || j2>s2n) {
                j2=Math.min(k, s2n);
                i2=k-j2;
            }
            
            //2. Calculate matrix values for corners. This is just like the loop
            //below except (1) we need to be careful of array index problems 
            //and (2) we don't look to the left of (i1, j1) or above (i2, j2)
            //when that neighbor lies outside the band.
            Assert.that(i1>0, "Zero i1");  //j1 may be zero
            Assert.that(j2>0, "Zero j2");  //i2 may be zero
            //   a) Look in towards diagonal.  The order matters when both
            //   corners are the same cell: the second store then wins and the
            //   other direction is picked up again in step (c).
            d[i1][j1]=d[i1-1][j1]+1;
            d[i2][j2]=d[i2][j2-1]+1;                            
            //   b) Look along the diagonal, unless on edge of matrix
            if (j1>0) 
                d[i1][j1]=Math.min(d[i1][j1],
                              d[i1-1][j1-1] + diff(s1.charAt(i1-1),
                                                   s2.charAt(j1-1)));
            if (i2>0)
                d[i2][j2]=Math.min(d[i2][j2],
                              d[i2-1][j2-1] + diff(s1.charAt(i2-1),
                                                   s2.charAt(j2-1)));
            //   c) Look out away from the diagonal whenever that neighbor is
            //   inside the band, unless on edge of matrix.  A corner sits
            //   strictly inside the band on every "inner" diagonal and also
            //   whenever it was clamped onto the bottom row or right column
            //   above.  The right-column case used to be skipped on "outer"
            //   diagonals, which overestimated e.g. match("abcqr", "zzabc") as
            //   5 instead of 4.  A corner lying exactly on the band edge has
            //   no computed neighbor on that side, so it is not consulted.
            if (i1-j1<maxOps && j1>0)
                d[i1][j1]=Math.min(d[i1][j1],
                                   d[i1][j1-1]+1);            
            if (j2-i2<maxOps && i2>0) 
                d[i2][j2]=Math.min(d[i2][j2],
                                   d[i2-1][j2]+1);
            minThisDiag=Math.min(d[i1][j1], d[i2][j2]);

            //3. Calculate matrix value for each element of the diagonal except
            //the endpoints...
            int i=i1-1;
            int j=j1+1;
            while (i>i2 && j<j2) {
                //Fill in d[i][j] using previous calculated values
                int dij=min3(d[i-1][j-1] + diff(s1.charAt(i-1), s2.charAt(j-1)),
                             d[i-1][j]   + 1,
                             d[i][j-1]   + 1); 
                d[i][j]=dij;
                minThisDiag=Math.min(minThisDiag, dij);
                //Move up and to the right in the matrix.
                i--;
                j++;
            }
            
            //If min value on last two diags is too big, quit.
            if (minThisDiag>maxOps && minLastDiag>maxOps) {
                return minThisDiag;
            }
            minLastDiag=minThisDiag;
        }     

        return d[s1n][s2n];
    }

    /** Returns 0 if a==b, or 1 otherwise. */
    private static int diff(char a, char b) {
        return (a==b) ? 0 : 1;
    }

    /** Returns the smallest of n1, n2, and n3. */
    private static int min3(int n1, int n2, int n3) {
        return( Math.min( n1, Math.min( n2, n3 ) ) );
    }
}

