package com.jware.util.xml;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.FileWriter;
import java.io.PrintWriter;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.w3c.dom.Document;
import org.w3c.tidy.Configuration;
import org.w3c.tidy.Tidy;

/**
 * Static helpers that run HTML markup through JTidy and return it as XHTML text.
 *
 * @author jrmt
 */
public class XHtmlUtils {

    protected static Log log = LogFactory.getLog( XHtmlUtils.class );

    /**
     * File that receives Tidy's diagnostic output. It is opened without append
     * mode, so each conversion overwrites the previous contents.
     */
    private static final String errorFileName = "/tmp/tidy.err";

    /**
     * Default constructor. All functionality is exposed through static methods;
     * the constructor is kept public for existing callers.
     */
    public XHtmlUtils() {
        super();
    }

    /**
     * Converts HTML to XHTML with content indentation enabled.
     *
     * @param htmlContents the HTML markup to convert; must not be <code>null</code>
     * @return the XHTML text produced by Tidy
     * @throws Exception if the conversion fails
     */
    public static String convertHTML2XHTML( String htmlContents )
        throws Exception {
        return convertHTML2XHTML( htmlContents, true );
    }

    /**
     * Converts HTML to XHTML using JTidy.
     * <p>
     * The input is handed to Tidy as UTF-8 bytes and Tidy's output is decoded as
     * UTF-8. Tidy is configured for XML/XHTML output with the doctype omitted,
     * no Tidy meta mark, indented attributes, quoted ampersands and
     * non-breaking spaces, and numeric entities. After conversion, undefined
     * characters and characters above U+FFF0 are replaced (see
     * {@link #replaceInvalidChars(StringBuffer)}).
     * <p>
     * Tidy's diagnostics are written to the error file when it can be opened;
     * failing to open it is logged and the conversion continues without it.
     *
     * @param htmlContents the HTML markup to convert; a <code>null</code> value
     *                     results in a <code>NullPointerException</code>
     * @param intend       <code>true</code> to have Tidy indent element content
     * @return the XHTML text produced by Tidy, after character clean-up
     * @throws Exception if the conversion fails; the failure is logged and rethrown
     */
    public static String convertHTML2XHTML( String htmlContents, boolean intend )
        throws Exception {

        PrintWriter pw = openErrorWriter();
        try {
            Tidy tidy = new Tidy();

            // Tell Tidy to convert HTML to XML/XHTML, reading and writing UTF-8.
            tidy.setXmlOut( true );
            tidy.setCharEncoding( Configuration.UTF8 );
            tidy.setIndentContent( intend );
            tidy.setXHTML( true );
            tidy.setQuoteNbsp( true );
            tidy.setDocType( "omit" );
            tidy.setTidyMark( false );
            tidy.setIndentAttributes( true );
            tidy.setQuoteAmpersand( true );
            tidy.setNumEntities( true );

            // Redirect Tidy's messages to the error file only if it could be opened.
            if( pw != null ){
                tidy.setErrout( pw );
            }

            // In-memory streams hold no system resources, so they need no explicit close.
            ByteArrayInputStream in = new ByteArrayInputStream( htmlContents.getBytes( "UTF-8" ) );
            ByteArrayOutputStream out = new ByteArrayOutputStream();

            // Only the serialized output is used; the returned DOM is not needed.
            tidy.parseDOM( in, out );

            StringBuffer result = new StringBuffer( out.toString( "UTF-8" ) );
            replaceInvalidChars( result );
            return result.toString();

        } catch( Exception e ) {
            log.error( "Failed to convert HTML to XHTML", e );
            throw e;
        } finally {
            // Release the error file handle on every path; it was previously leaked.
            if( pw != null ){
                pw.close();
            }
        }
    }

    /**
     * Opens an auto-flushing writer on the Tidy error file.
     *
     * @return the writer, or <code>null</code> if the file could not be opened
     */
    private static PrintWriter openErrorWriter() {
        try {
            return new PrintWriter( new FileWriter( errorFileName ), true );
        } catch( Exception e ) {
            // Best effort: conversion still works without a dedicated error file.
            log.info( errorFileName + " -- Error file path not exist. ", e );
            return null;
        }
    }

    /**
     * Replaces, in place, every character that is undefined in Unicode or greater
     * than U+FFF0. The replacement is a space, except when the following
     * character is '/', in which case it is '&lt;'.
     * NOTE(review): the '&lt;' case presumably repairs a damaged closing-tag
     * opener -- confirm against the inputs that motivated it.
     *
     * @param text the buffer to clean up
     */
    private static void replaceInvalidChars( StringBuffer text ) {
        for( int i = 0; i < text.length(); i++ ){
            char ch = text.charAt( i );
            if( !Character.isDefined( ch ) || ch > 0xfff0 ){
                boolean beforeSlash = i < text.length() - 1 && text.charAt( i + 1 ) == '/';
                text.setCharAt( i, beforeSlash ? '<' : ' ' );
            }
        }
    }

}
