/*
 * Copyright 2009-2010 the Fess Project and the Others.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
 * either express or implied. See the License for the specific language
 * governing permissions and limitations under the License.
 */
package jp.sf.fess.transformer;

import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import javax.xml.transform.TransformerException;

import jp.sf.fess.Constants;
import jp.sf.fess.db.exentity.CrawlingConfig;
import jp.sf.fess.helper.CrawlingConfigHelper;
import jp.sf.fess.helper.CrawlingSessionHelper;
import jp.sf.fess.helper.OverlappingHostHelper;
import jp.sf.fess.helper.PathMappingHelper;

import org.apache.commons.lang.StringUtils;
import org.cyberneko.html.parsers.DOMParser;
import org.seasar.framework.container.SingletonS2Container;
import org.seasar.framework.util.SerializeUtil;
import org.seasar.framework.util.StringUtil;
import org.seasar.robot.RobotCrawlAccessException;
import org.seasar.robot.RobotSystemException;
import org.seasar.robot.entity.AccessResultData;
import org.seasar.robot.entity.ResponseData;
import org.seasar.robot.entity.ResultData;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.w3c.dom.Document;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.InputSource;

public class FessXpathTransformer extends AbstractFessXpathTransformer {
    private static final Logger logger = LoggerFactory
            .getLogger(FessXpathTransformer.class);

    public String cacheXpath = "/HTML/BODY";

    public String contentXpath = "/HTML/BODY";

    public String anchorXpath = "//A/@href";

    public String digestXpath = "/HTML/HEAD/META[@name='description']/@content";

    public List<String> prunedTagList = new ArrayList<String>();

    public boolean prunedCacheContent = true;

    public int maxDigestLength = 200;

    public Map<String, String> convertUrlMap = new HashMap<String, String>();

    protected void putResultDataBody(Map<String, Object> dataMap, String key,
            Object value) {
        dataMap.put(key, value);
    }

    @Override
    protected void storeData(ResponseData responseData, ResultData resultData) {
        DOMParser parser = getDomParser();
        try {
            InputSource is = new InputSource(responseData.getResponseBody());
            if (responseData.getCharSet() != null) {
                is.setEncoding(responseData.getCharSet());
            }
            parser.parse(is);
        } catch (Exception e) {
            throw new RobotCrawlAccessException("Could not parse "
                    + responseData.getUrl(), e);
        }
        Document document = parser.getDocument();

        Map<String, Object> dataMap = new HashMap<String, Object>();
        for (Map.Entry<String, String> entry : fieldRuleMap.entrySet()) {
            Node value = null;
            try {
                value = getXPathAPI().selectSingleNode(document,
                        entry.getValue());
            } catch (TransformerException e) {
                logger.warn("Could not parse a value of " + entry.getKey()
                        + ":" + entry.getValue());
            }
            putResultDataBody(dataMap, entry.getKey(), value != null ? value
                    .getTextContent() : null);
        }
        putAdditionalData(dataMap, responseData, document);

        try {
            resultData.setData(SerializeUtil.fromObjectToBinary(dataMap));
        } catch (Exception e) {
            throw new RobotCrawlAccessException("Could not serialize object: "
                    + responseData.getUrl(), e);
        }
        resultData.setEncoding(charsetName);
    }

    protected void putAdditionalData(Map<String, Object> dataMap,
            ResponseData responseData, Document document) {
        CrawlingSessionHelper crawlingSessionHelper = SingletonS2Container
                .getComponent("crawlingSessionHelper");
        String sessionId = crawlingSessionHelper
                .getCanonicalSessionId(responseData.getSessionId());
        PathMappingHelper pathMappingHelper = SingletonS2Container
                .getComponent("pathMappingHelper");
        String url = pathMappingHelper.replaceUrl(sessionId, responseData
                .getUrl());

        // title
        // content
        putResultDataBody(dataMap, "content",
                normalizeContent(getSingleNodeValue(document, contentXpath,
                        true)));
        // cache 
        // TODO should I have an entire cache?
        String body = normalizeContent(getSingleNodeValue(document, cacheXpath,
                prunedCacheContent));
        putResultDataBody(dataMap, "cache", body);
        // digest
        String digest = getSingleNodeValue(document, digestXpath, false);
        putResultDataBody(dataMap, "digest", digest != null ? digest
                : Constants.DIGEST_PREFIX
                        + StringUtils.abbreviate(body, maxDigestLength));
        // segment
        putResultDataBody(dataMap, "segment", sessionId);
        // host
        putResultDataBody(dataMap, "host", getHost(url));
        // site
        putResultDataBody(dataMap, "site", getSite(url, responseData
                .getCharSet()));
        // url
        putResultDataBody(dataMap, "url", url);
        // tstamp
        putResultDataBody(dataMap, "tstamp", Long
                .toString(new Date().getTime()));
        // anchor
        putResultDataBody(dataMap, "anchor", getAnchorList(document,
                responseData.getUrl()));
        // mimetype
        putResultDataBody(dataMap, "mimetype", responseData.getMimeType());
        // contentLength
        putResultDataBody(dataMap, "contentLength", Long.toString(responseData
                .getContentLength()));
        //  lastModified
        putResultDataBody(dataMap, "lastModified", Long.toString(responseData
                .getLastModified().getTime()));
        // config
        CrawlingConfigHelper crawlingConfigHelper = SingletonS2Container
                .getComponent("crawlingConfigHelper");
        CrawlingConfig crawlingConfig = crawlingConfigHelper
                .getCrawlingConfig(responseData.getSessionId());
        // indexingTarget
        putResultDataBody(dataMap, Constants.INDEXING_TARGET, crawlingConfig
                .getIndexingTarget(url));
        //  boost
        putResultDataBody(dataMap, "boost", crawlingConfig.getDocumentBoost());
        // type: browserType
        List<String> browserTypeList = new ArrayList<String>();
        for (String browserType : crawlingConfig.getBrowserTypeValues()) {
            browserTypeList.add(browserType);
        }
        putResultDataBody(dataMap, "type", browserTypeList);
        // label: labelType
        List<String> labelTypeList = new ArrayList<String>();
        for (String labelType : crawlingConfig.getLabelTypeValues()) {
            labelTypeList.add(labelType);
        }
        putResultDataBody(dataMap, "label", labelTypeList);
        // role: roleType
        List<String> roleTypeList = new ArrayList<String>();
        for (String roleType : crawlingConfig.getRoleTypeValues()) {
            roleTypeList.add(roleType);
        }
        putResultDataBody(dataMap, "role", roleTypeList);
        // TODO date
        // TODO lang
        // id
        putResultDataBody(dataMap, "id", crawlingSessionHelper
                .generateId(dataMap));

    }

    protected String getSingleNodeValue(Document document, String xpath,
            boolean pruned) {
        Node value = null;
        try {
            value = getXPathAPI().selectSingleNode(document, xpath);
        } catch (Exception e) {
            logger.warn("Could not parse a value of " + xpath);
        }
        if (value == null) {
            return null;
        }
        if (pruned) {
            Node node = pruneNode(value.cloneNode(true));
            return node != null ? node.getTextContent() : null;
        } else {
            return value.getTextContent();
        }
    }

    protected Node pruneNode(Node node) {
        NodeList nodeList = node.getChildNodes();
        List<Node> childNodeList = new ArrayList<Node>();
        List<Node> removedNodeList = new ArrayList<Node>();
        for (int i = 0; i < nodeList.getLength(); i++) {
            Node childNode = nodeList.item(i);
            if (isPrunedTag(childNode.getNodeName())) {
                removedNodeList.add(childNode);
            } else {
                childNodeList.add(childNode);
            }
        }

        for (Node childNode : removedNodeList) {
            node.removeChild(childNode);
        }

        for (Node childNode : childNodeList) {
            pruneNode(childNode);
        }

        return node;
    }

    protected boolean isPrunedTag(String tagName) {
        for (String name : prunedTagList) {
            if (name.equalsIgnoreCase(tagName)) {
                return true;
            }
        }
        return false;
    }

    protected String getMultipleNodeValue(Document document, String xpath) {
        NodeList nodeList = null;
        StringBuilder buf = new StringBuilder(100);
        try {
            nodeList = getXPathAPI().selectNodeList(document, xpath);
            for (int i = 0; i < nodeList.getLength(); i++) {
                Node node = nodeList.item(i);
                buf.append(node.getTextContent());
                buf.append("\n");
            }
        } catch (Exception e) {
            logger.warn("Could not parse a value of " + xpath);
        }
        return buf.toString();
    }

    protected String replaceOverlappingHost(String url) {
        try {
            // remove overlapping host
            OverlappingHostHelper overlappingHostHelper = SingletonS2Container
                    .getComponent("overlappingHostHelper");
            return overlappingHostHelper.convert(url);
        } catch (Exception e) {
            return url;
        }
    }

    protected List<String> getAnchorList(Document document, String currentUrl) {
        List<String> anchorList = new ArrayList<String>();
        String baseHref = getBaseHref(document);
        try {
            URL url = new URL(baseHref != null ? baseHref : currentUrl);
            NodeList list = getXPathAPI().selectNodeList(document, anchorXpath);
            for (int i = 0; i < list.getLength(); i++) {
                Node node = list.item(i);
                String attrValue = node.getTextContent();
                if (isValidPath(attrValue)) {
                    try {
                        URL childUrl = new URL(url, attrValue);
                        String u = normalizeUrl(childUrl.toString());
                        if (StringUtil.isNotBlank(u)) {
                            anchorList.add(replaceOverlappingHost(u));
                        }
                    } catch (MalformedURLException e) {
                    }
                }
            }
        } catch (Exception e) {
            logger.warn("Could not parse anchor tags.", e);
        }
        return anchorList;
    }

    @Override
    protected List<String> convertChildUrlList(List<String> urlList) {

        List<String> newUrlList = new ArrayList<String>();
        if (urlList != null) {
            for (String url : urlList) {
                for (Map.Entry<String, String> entry : convertUrlMap.entrySet()) {
                    url = url.replaceAll(entry.getKey(), entry.getValue());
                }

                newUrlList.add(replaceOverlappingHost(url));
            }
        }
        return newUrlList;
    }

    public void addPrunedTag(String tagName) {
        if (StringUtil.isNotBlank(tagName)) {
            prunedTagList.add(tagName);
        }
    }

    @Override
    public Object getData(AccessResultData accessResultData) {
        byte[] data = accessResultData.getData();
        if (data != null) {
            try {
                return SerializeUtil.fromBinaryToObject(data);
            } catch (Exception e) {
                throw new RobotSystemException(
                        "Could not create an instanced from bytes.", e);
            }
        }
        return new HashMap<String, Object>();
    }

    @Override
    protected boolean isValidPath(String path) {
        return super.isValidPath(path);
    }

    @Override
    protected void addChildUrlFromTagAttribute(List<String> urlList, URL url,
            String attrValue, String encoding) {
        String urlValue = attrValue.trim();
        URL childUrl;
        String u = null;
        try {
            childUrl = new URL(url, urlValue);
            u = encodeUrl(normalizeUrl(childUrl.toExternalForm()), encoding);
        } catch (MalformedURLException e) {
            int pos = urlValue.indexOf(':');
            if (pos > 0 && pos < 10) {
                u = encodeUrl(normalizeUrl(urlValue), encoding);
            }
        }

        if (u == null) {
            logger.warn("Ignored child URL: " + attrValue + " in " + url);
            return;
        }

        if (logger.isDebugEnabled()) {
            logger.debug(attrValue + " -> " + u);
        }
        if (StringUtil.isNotBlank(u)) {
            if (logger.isDebugEnabled()) {
                logger.debug("Add Child: " + u);
            }
            urlList.add(u);
        } else {
            if (logger.isDebugEnabled()) {
                logger.debug("Skip Child: " + u);
            }
        }
    }
}
