/*******************************************************************************
 * Copyright (c) 2008 IGA Tosiki, NTT DATA BUSINESS BRAINS Corp.
 * All rights reserved. This program and the accompanying materials
 * are made available under the terms of the Eclipse Public License v1.0
 * which accompanies this distribution, and is available at
 * http://www.eclipse.org/legal/epl-v10.html
 *
 * Contributors:
 *    IGA Tosiki (NTT DATA BUSINESS BRAINS Corp.) - initial API and implementation
 *******************************************************************************/
/*
 * blanco Framework
 * Copyright (C) 2008 NTT DATA BUSINESS BRAINS CORPORATION
 * 
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 */
package blanco.html.parser;

import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.List;

import org.xml.sax.ContentHandler;
import org.xml.sax.DTDHandler;
import org.xml.sax.SAXException;
import org.xml.sax.ext.DeclHandler;
import org.xml.sax.ext.LexicalHandler;

import blanco.commons.util.BlancoStringUtil;
import blanco.html.parser.helper.BlancoHtmlNullContentHandler;
import blanco.html.parser.util.BlancoHtmlParserUtil;
import blanco.html.parser.valueobject.BlancoHtmlAttribute;
import blanco.xml.bind.BlancoXmlAttributesImpl;
import blanco.xml.bind.valueobject.BlancoXmlAttribute;

/**
 * Lenient, character-at-a-time HTML parser that reports what it finds as
 * SAX-style events.
 *
 * <p>
 * Text and elements are reported to the {@link ContentHandler}, comments,
 * CDATA boundaries and DOCTYPE declarations to the {@link LexicalHandler},
 * and other <code>&lt;!...&gt;</code> declarations to the
 * {@link DeclHandler}. Any handler that has not been supplied by the time
 * parsing starts is replaced by a {@link BlancoHtmlNullContentHandler}.
 * </p>
 *
 * <p>
 * The reader currently being parsed is kept in a field, so one instance must
 * not be used for two parses at the same time.
 * </p>
 */
class BlancoHtmlParserImpl implements BlancoHtmlParser {
    /** Read-ahead limit used for the short look-aheads inside tags and comments. */
    private static final int LOOKAHEAD_LIMIT = 100;

    /** Comment type passed to {@link #processComment(int)} for <code>&lt;!-- ... --&gt;</code>. */
    private static final int COMMENT_TYPE_MARKUP = 0;

    /** Comment type passed to {@link #processComment(int)} for <code>&lt;%-- ... --%&gt;</code>. */
    private static final int COMMENT_TYPE_SERVER = 1;

    /** Quote style recorded on an attribute whose value was not quoted. */
    private static final int QUOTE_NONE = 0;

    /** Quote style recorded on an attribute whose value was opened with <code>'</code>. */
    private static final int QUOTE_SINGLE = 1;

    /** Quote style recorded on an attribute whose value was opened with <code>"</code>. */
    private static final int QUOTE_DOUBLE = 2;

    /** Prefix that {@link #parseDtdToken()} puts in front of quoted literals. */
    private static final String DTD_VALUE_PREFIX = "value:";

    /**
     * Reader the document is currently being read from.
     */
    protected BufferedReader fReader;

    /**
     * Receiver of document, element and character events.
     */
    protected ContentHandler fContentHandler = null;

    /**
     * Receiver of comment, CDATA and DOCTYPE events.
     */
    protected LexicalHandler fLexicalHandler = null;

    /**
     * DTD handler. It is only stored (and defaulted) by this class; no event
     * is sent to it from the code in this class.
     */
    protected DTDHandler fDTDHandler = null;

    /**
     * Receiver of attribute, element and entity declaration events.
     */
    protected DeclHandler fDeclHandler = null;

    /**
     * Character encoding used when parsing from a byte array.
     */
    protected String fEncoding;

    /**
     * Sets the content handler.
     * 
     * @param handler
     *            Content handler that will receive the events.
     */
    public void setHandler(final ContentHandler handler) {
        fContentHandler = handler;
    }

    /**
     * Gets the content handler.
     * 
     * @return Content handler currently in use, or <code>null</code> if none
     *         has been set and no parse has run yet.
     */
    public ContentHandler getHandler() {
        return fContentHandler;
    }

    /**
     * Sets the lexical handler.
     * 
     * @param handler
     *            Lexical handler that will receive the events.
     */
    public void setLexicalHandler(final LexicalHandler handler) {
        fLexicalHandler = handler;
    }

    /**
     * Gets the lexical handler.
     * 
     * @return Lexical handler currently in use.
     */
    public LexicalHandler getLexicalHandler() {
        return fLexicalHandler;
    }

    /**
     * Sets the DTD handler.
     * 
     * @param handler
     *            DTD handler to store.
     */
    public void setDTDHandler(final DTDHandler handler) {
        fDTDHandler = handler;
    }

    /**
     * Gets the DTD handler.
     * 
     * @return DTD handler currently in use.
     */
    public DTDHandler getDTDHandler() {
        return fDTDHandler;
    }

    /**
     * Sets the declaration handler.
     * 
     * @param handler
     *            Declaration handler that will receive the events.
     */
    public void setDeclHandler(final DeclHandler handler) {
        fDeclHandler = handler;
    }

    /**
     * Gets the declaration handler.
     * 
     * @return Declaration handler currently in use.
     */
    public DeclHandler getDeclHandler() {
        return fDeclHandler;
    }

    /**
     * Gets the reader used by the most recent parse.
     * 
     * @return Reader object, or <code>null</code> if no parse has started.
     */
    public BufferedReader getReader() {
        return fReader;
    }

    /**
     * Sets the character encoding used when parsing from a byte array.
     * 
     * @param encoding
     *            Encoding name. <code>null</code> or an empty string makes
     *            {@link #parse(byte[])} detect the encoding itself.
     */
    public void setEncoding(String encoding) {
        fEncoding = encoding;
    }

    /**
     * Gets the character encoding used when parsing from a byte array.
     * 
     * @return Encoding name.
     */
    public String getEncoding() {
        return fEncoding;
    }

    /**
     * Parses an HTML document given as a byte array.
     * 
     * <p>
     * When no encoding has been set, it is decided from the bytes by
     * {@link BlancoHtmlParserUtil#decideEncoding(byte[])} and stored through
     * {@link #setEncoding(String)}, so it stays in effect for later calls.
     * </p>
     * 
     * @param argInputHtml
     *            Bytes of the HTML document.
     * @throws IOException
     *             If reading or decoding the input fails.
     * @throws SAXException
     *             If a handler rejects an event.
     */
    public void parse(final byte[] argInputHtml) throws IOException,
            SAXException {
        if (BlancoStringUtil.null2Blank(getEncoding()).length() == 0) {
            // No encoding supplied by the caller: detect it from the content.
            setEncoding(BlancoHtmlParserUtil.decideEncoding(argInputHtml));
        }

        final BufferedReader reader = new BufferedReader(new InputStreamReader(
                new ByteArrayInputStream(argInputHtml), getEncoding()));
        try {
            parse(reader);
        } finally {
            reader.close();
        }
    }

    /**
     * Parses an HTML document from a reader. The reader is not closed.
     * 
     * <p>
     * Every handler that is still <code>null</code> is replaced by a
     * {@link BlancoHtmlNullContentHandler} so events can always be
     * dispatched; these defaults remain installed after the call.
     * </p>
     * 
     * @param reader
     *            Source of the HTML text.
     * @throws IOException
     *             If reading fails.
     * @throws SAXException
     *             If a handler rejects an event.
     */
    public void parse(final BufferedReader reader) throws IOException,
            SAXException {
        fReader = reader;

        if (fContentHandler == null) {
            fContentHandler = new BlancoHtmlNullContentHandler();
        }
        if (fLexicalHandler == null) {
            fLexicalHandler = new BlancoHtmlNullContentHandler();
        }
        if (fDTDHandler == null) {
            fDTDHandler = new BlancoHtmlNullContentHandler();
        }
        if (fDeclHandler == null) {
            fDeclHandler = new BlancoHtmlNullContentHandler();
        }

        processDocument();
    }

    /**
     * Reads the whole document: text between tags is reported as character
     * events, and everything starting with <code>&lt;</code> is handed to
     * {@link #processElementOrComment()}.
     * 
     * @throws IOException
     *             If reading fails.
     * @throws SAXException
     *             If a handler rejects an event.
     */
    protected void processDocument() throws IOException, SAXException {
        fContentHandler.startDocument();

        final StringBuilder text = new StringBuilder();
        for (;;) {
            final int iRead = fReader.read();
            if (iRead < 0) {
                break;
            }

            final char cRead = (char) iRead;
            if (cRead == '<') {
                // Report the text collected so far before the markup events.
                flushText(text);
                processElementOrComment();
            } else {
                text.append(cRead);
            }
        }

        flushText(text);

        fContentHandler.endDocument();
    }

    /**
     * Reports the collected text (if any) as one character event and empties
     * the buffer.
     */
    private void flushText(final StringBuilder text) throws SAXException {
        if (text.length() == 0) {
            return;
        }
        final char[] wrk = parseText(text.toString()).toCharArray();
        fContentHandler.characters(wrk, 0, wrk.length);
        text.setLength(0);
    }

    /**
     * Processes whatever follows a <code>&lt;</code>: a start tag, an end
     * tag, an empty-element tag, a comment or a <code>&lt;!...&gt;</code>
     * declaration. The opening <code>&lt;</code> has already been consumed.
     * 
     * <p>
     * If the input ends before the construct is closed, nothing is reported
     * for it.
     * </p>
     * 
     * @throws IOException
     *             If reading fails.
     * @throws SAXException
     *             If a handler rejects an event.
     */
    protected void processElementOrComment() throws IOException, SAXException {
        boolean isStartElement = true;
        final StringBuilder bufElement = new StringBuilder();
        final List<BlancoXmlAttribute> attributeList = new ArrayList<BlancoXmlAttribute>();
        for (;;) {
            final int iRead = fReader.read();
            if (iRead < 0) {
                return;
            }

            final char cRead = (char) iRead;
            if (BlancoHtmlParserUtil.isWhiteSpace(cRead)) {
                // White space inside a tag introduces an attribute (or is
                // just padding, in which case processAttribute returns null).
                final BlancoHtmlAttribute attr = processAttribute();
                if (attr != null) {
                    attributeList.add(attr);
                }
                continue;
            }

            if (cRead == '!' && bufElement.length() == 0) {
                // "<!" is either a comment ("<!--") or a declaration.
                fReader.mark(LOOKAHEAD_LIMIT);
                if (fReader.read() == '-' && fReader.read() == '-') {
                    processComment(COMMENT_TYPE_MARKUP);
                } else {
                    fReader.reset();
                    processDtd();
                }
                return;
            }

            if (cRead == '%' && bufElement.length() == 0) {
                // "<%--" starts a comment as well; otherwise the '%' is kept
                // as part of the name.
                fReader.mark(LOOKAHEAD_LIMIT);
                if (fReader.read() == '-' && fReader.read() == '-') {
                    processComment(COMMENT_TYPE_SERVER);
                    return;
                }
                fReader.reset();
            } else if (cRead == '/') {
                if (bufElement.length() == 0) {
                    // "</": this is an end tag.
                    isStartElement = false;
                    continue;
                }

                // A '/' after the name: peek one character to see whether
                // this is the "/>" of an empty-element tag.
                fReader.mark(LOOKAHEAD_LIMIT);
                if (fReader.read() == '>') {
                    final String name = bufElement.toString();
                    fContentHandler.startElement(null, name, name,
                            new BlancoXmlAttributesImpl(attributeList));
                    fContentHandler.endElement(null, name, name);
                    return;
                }

                // Not "/>": undo the peek and keep the '/' in the name.
                fReader.reset();
            } else if (cRead == '>') {
                final String name = bufElement.toString();
                if (isStartElement) {
                    fContentHandler.startElement(null, name, name,
                            new BlancoXmlAttributesImpl(attributeList));

                    // SCRIPT and STYLE content is read as CDATA. The name is
                    // compared without going through the default locale so
                    // that e.g. "script" is recognised under a Turkish
                    // locale too.
                    if ("SCRIPT".equalsIgnoreCase(name)) {
                        processCData("SCRIPT");
                    } else if ("STYLE".equalsIgnoreCase(name)) {
                        processCData("STYLE");
                    }
                } else {
                    // An empty character event always precedes the end
                    // element event.
                    // NOTE(review): the reason is not visible here;
                    // presumably a downstream consumer relies on it.
                    fContentHandler.characters(new char[0], 0, 0);
                    fContentHandler.endElement(null, name, name);
                }
                return;
            }

            bufElement.append(cRead);
        }
    }

    /**
     * Reads a comment up to its terminator and reports its text to the
     * lexical handler. The comment opener has already been consumed.
     * 
     * <p>
     * If the input ends before the terminator is found, no event is
     * reported.
     * </p>
     * 
     * @param argType
     *            <code>0</code> for a comment opened with
     *            <code>&lt;!--</code>, which ends at <code>--&gt;</code>;
     *            <code>1</code> for a comment opened with
     *            <code>&lt;%--</code>, which ends at <code>--%&gt;</code>.
     * @throws IOException
     *             If reading fails.
     * @throws SAXException
     *             If the handler rejects the event.
     */
    protected void processComment(final int argType) throws IOException,
            SAXException {
        final boolean isServerComment = (argType == COMMENT_TYPE_SERVER);
        final StringBuilder bufComment = new StringBuilder();
        for (;;) {
            final int iRead = fReader.read();
            if (iRead < 0) {
                return;
            }

            final char cRead = (char) iRead;
            if (cRead == '-') {
                fReader.mark(LOOKAHEAD_LIMIT);
                if (isRestOfCommentTerminator(isServerComment)) {
                    final char[] wrk = translateNewLine(bufComment.toString())
                            .toCharArray();
                    // Character references inside comments are not decoded.
                    fLexicalHandler.comment(wrk, 0, wrk.length);
                    return;
                }

                // Just a '-' inside the comment text.
                fReader.reset();
            }

            bufComment.append(cRead);
        }
    }

    /**
     * Checks, after a <code>-</code> has been read, whether the following
     * characters complete the comment terminator (<code>-&gt;</code>, or
     * <code>-%&gt;</code> for a server-side comment). Characters are
     * consumed; the caller resets the reader when this returns
     * <code>false</code>.
     */
    private boolean isRestOfCommentTerminator(final boolean isServerComment)
            throws IOException {
        if (fReader.read() != '-') {
            return false;
        }
        if (isServerComment && fReader.read() != '%') {
            return false;
        }
        return fReader.read() == '>';
    }

    /**
     * Reads one attribute (name and optional value) from inside a tag.
     * 
     * <p>
     * The character that ends the attribute (white space, <code>&gt;</code>
     * or, while reading the name, <code>/</code>) is left unread for the
     * caller. The value may be unquoted, or enclosed in single or double
     * quotes; the quote style is recorded on the attribute and character
     * references in the value are decoded.
     * </p>
     * 
     * @return The attribute, or <code>null</code> if the attribute ended
     *         before any name character was read. An attribute without
     *         <code>=</code> is returned with no value set.
     * @throws IOException
     *             If reading fails.
     */
    protected BlancoHtmlAttribute processAttribute() throws IOException {
        final StringBuilder bufName = new StringBuilder();
        final StringBuilder bufValue = new StringBuilder();

        final BlancoHtmlAttribute attrib = new BlancoHtmlAttribute();
        attrib.setQuote(QUOTE_NONE);

        // Attribute name: everything up to '='.
        for (;;) {
            fReader.mark(1);
            final int iRead = fReader.read();
            if (iRead < 0) {
                break;
            }

            final char cRead = (char) iRead;
            if (cRead == '=') {
                break;
            }
            if (BlancoHtmlParserUtil.isWhiteSpace(cRead) || cRead == '>'
                    || cRead == '/') {
                // The attribute ended without a value. Push the delimiter
                // back so the caller sees it.
                fReader.reset();

                if (bufName.length() == 0) {
                    return null;
                }
                attrib.setLocalName(bufName.toString());
                attrib.setQName(bufName.toString());
                return attrib;
            }

            bufName.append(cRead);
        }

        // Attribute value.
        for (;;) {
            fReader.mark(1);
            final int iRead = fReader.read();
            if (iRead < 0) {
                break;
            }

            final char cRead = (char) iRead;
            if (cRead == '>' || BlancoHtmlParserUtil.isWhiteSpace(cRead)) {
                // End of the tag or of this attribute: leave the delimiter
                // for the caller.
                fReader.reset();
                break;
            }
            if (cRead == '"' || cRead == '\'') {
                // Quoted part: it runs to the matching quote and ends the
                // value.
                attrib.setQuote(cRead == '"' ? QUOTE_DOUBLE : QUOTE_SINGLE);
                readUntilQuote(cRead, bufValue);
                break;
            }

            // Unquoted value characters are accepted as they are.
            bufValue.append(cRead);
        }

        attrib.setLocalName(bufName.toString());
        attrib.setQName(bufName.toString());
        attrib.setValue(BlancoHtmlParserUtil.decodeCharReference(bufValue
                .toString()));
        return attrib;
    }

    /**
     * Appends characters to <code>bufValue</code> until the given quote
     * character (which is consumed but not appended) or the end of input.
     */
    private void readUntilQuote(final char quote, final StringBuilder bufValue)
            throws IOException {
        for (;;) {
            final int iRead = fReader.read();
            if (iRead < 0 || (char) iRead == quote) {
                return;
            }
            bufValue.append((char) iRead);
        }
    }

    /**
     * Reads the content of a SCRIPT or STYLE element, which HTML 4 treats as
     * CDATA.
     * 
     * http://www.w3.org/TR/html4/types.html#type-cdata
     * 
     * <p>
     * Everything up to (but not including) the matching end tag, or up to
     * the end of input, is reported as a single character event between
     * <code>startCDATA</code> and <code>endCDATA</code>. The end tag itself
     * is left in the reader. The end tag is matched case-insensitively.
     * </p>
     * 
     * @param elementNameUpper
     *            Name of the element whose end tag terminates the section.
     * @throws IOException
     *             If reading fails.
     * @throws SAXException
     *             If a handler rejects an event.
     */
    protected void processCData(final String elementNameUpper)
            throws IOException, SAXException {
        fLexicalHandler.startCDATA();

        final String endTagRest = "/" + elementNameUpper + ">";
        final StringBuilder characters = new StringBuilder();
        for (;;) {
            // Room for '<' plus the rest of the end tag.
            fReader.mark(elementNameUpper.length() + 4);
            final int iRead = fReader.read();
            if (iRead < 0) {
                break;
            }

            final char cRead = (char) iRead;
            if (cRead == '<') {
                final char[] bufRead = new char[endTagRest.length()];
                final int readLen = fReader.read(bufRead);
                // Rewind to just before the '<'.
                fReader.reset();

                // readLen is -1 when '<' was the last character of the
                // input; a short or failed read cannot be the end tag.
                final boolean isEndTag = readLen >= endTagRest.length()
                        && new String(bufRead, 0, readLen).regionMatches(true,
                                0, endTagRest, 0, endTagRest.length());
                if (isEndTag) {
                    // Leave the end tag unread for the normal tag handling.
                    break;
                }

                // An ordinary '<' inside the content.
                characters.append(cRead);

                // Consume the '<' again after the rewind.
                fReader.read();
            } else {
                characters.append(cRead);
            }
        }

        final char[] wrk = parseText(characters.toString()).toCharArray();
        // TODO The text handling of CDATA content still needs testing.
        fContentHandler.characters(wrk, 0, wrk.length);

        fLexicalHandler.endCDATA();
    }

    /**
     * Processes a <code>&lt;!...&gt;</code> declaration. The
     * <code>&lt;!</code> has already been consumed.
     * 
     * <p>
     * The first unquoted token is the declaration type and the second the
     * name; the first quoted literal is the value and the last further
     * quoted literal the system identifier. The type is matched
     * case-insensitively: DOCTYPE is reported to the lexical handler,
     * ATTRIBUTE, ELEMENT and ENTITY to the declaration handler, anything
     * else is ignored.
     * </p>
     * 
     * @throws IOException
     *             If reading fails.
     * @throws SAXException
     *             If a handler rejects an event.
     */
    protected void processDtd() throws IOException, SAXException {
        String type = null;
        String name = null;
        String value = null;
        String systemId = null;

        for (final String look : parseDtdToken()) {
            if (look.startsWith(DTD_VALUE_PREFIX)) {
                final String literal = look.substring(DTD_VALUE_PREFIX
                        .length());
                if (value == null) {
                    value = literal;
                } else {
                    systemId = literal;
                }
            } else if (type == null) {
                type = look;
            } else if (name == null) {
                name = look;
            }
        }

        // FIXME This is a first, tentative implementation; it needs
        // further testing.

        if ("DOCTYPE".equalsIgnoreCase(type)) {
            fLexicalHandler.startDTD(name, value, systemId);
            fLexicalHandler.endDTD();
        } else if ("ATTRIBUTE".equalsIgnoreCase(type)) {
            fDeclHandler.attributeDecl(null, name, "ATTRIBUTE", null, value);
        } else if ("ELEMENT".equalsIgnoreCase(type)) {
            fDeclHandler.elementDecl(name, value);
        } else if ("ENTITY".equalsIgnoreCase(type)) {
            fDeclHandler.internalEntityDecl(name, value);
        }
    }

    /**
     * Converts raw HTML text into the text reported to the handler: line
     * ends are normalized, then character references are decoded.
     * 
     * @param argInput
     *            Raw text as read from the document.
     * @return Converted text.
     */
    protected String parseText(final String argInput) {
        return BlancoHtmlParserUtil
                .decodeCharReference(translateNewLine(argInput));
    }

    /**
     * Splits the rest of a <code>&lt;!...&gt;</code> declaration into
     * tokens, reading up to and including the closing <code>&gt;</code> (or
     * the end of input).
     * 
     * <p>
     * Unquoted tokens are separated by white space. A double-quoted literal
     * is returned with the prefix <code>value:</code> and without its
     * quotes.
     * </p>
     * 
     * @return Tokens in the order in which they were completed.
     * @throws IOException
     *             If reading fails.
     */
    protected List<String> parseDtdToken() throws IOException {
        final List<String> result = new ArrayList<String>();
        final StringBuilder bufToken = new StringBuilder();
        for (;;) {
            final int iRead = fReader.read();
            if (iRead < 0) {
                break;
            }
            final char cRead = (char) iRead;
            if (cRead == '>') {
                break;
            }
            if (cRead == ' ' || BlancoHtmlParserUtil.isWhiteSpace(cRead)) {
                // Any white space (not only a blank) separates tokens, so
                // declarations wrapped over several lines are split
                // correctly.
                if (bufToken.length() > 0) {
                    result.add(bufToken.toString());
                    bufToken.setLength(0);
                }
            } else if (cRead == '"') {
                result.add(DTD_VALUE_PREFIX + parseDtdValue());
            } else {
                bufToken.append(cRead);
            }
        }
        if (bufToken.length() > 0) {
            result.add(bufToken.toString());
        }
        return result;
    }

    /**
     * Reads a double-quoted literal of a declaration. The opening quote has
     * already been consumed; the closing quote is consumed but not returned.
     * 
     * @return Text up to the closing quote or the end of input.
     * @throws IOException
     *             If reading fails.
     */
    protected String parseDtdValue() throws IOException {
        final StringBuilder buf = new StringBuilder();
        for (;;) {
            final int iRead = fReader.read();
            if (iRead < 0 || (char) iRead == '"') {
                break;
            }
            buf.append((char) iRead);
        }
        return buf.toString();
    }

    /**
     * Normalizes line ends the way XML requires: every CR LF pair and every
     * remaining CR becomes a single LF.
     * 
     * @param argInput
     *            Text to normalize.
     * @return Text with normalized line ends.
     */
    protected String translateNewLine(final String argInput) {
        // http://www.w3.org/TR/2006/REC-xml-20060816/#sec-line-ends
        // #xD #xA and any #xD that is not followed by #xA to a single #xA
        // character.

        // CR LF must be replaced first so that its CR is not turned into a
        // second LF.
        final String withoutCrLf = BlancoStringUtil.replace(argInput, "\r\n",
                "\n", true);
        return BlancoStringUtil.replace(withoutCrLf, "\r", "\n", true);
    }
}
