/*
 * Decompiled with CFR 0.152.
 */
package org.seasar.robot.extractor.impl;

import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.Charset;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.seasar.framework.util.InputStreamUtil;
import org.seasar.framework.util.StringUtil;
import org.seasar.robot.RobotSystemException;
import org.seasar.robot.entity.ExtractData;
import org.seasar.robot.extractor.ExtractException;
import org.seasar.robot.extractor.Extractor;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/*
 * This class specifies class file version 49.0 but uses Java 6 signatures.  Assumed Java 6.
 */
/**
 * {@link Extractor} implementation that turns an HTML byte stream into text by
 * decoding it and deleting everything that matches {@link #htmlTagPattern}.
 *
 * <p>The character encoding is sniffed from the first bytes of the stream using
 * {@link #metaCharsetPattern}; when nothing usable is found, {@link #encoding}
 * is used instead.
 *
 * <p>NOTE(review): this source was reconstructed by a decompiler, which warned
 * about a possible behaviour change in the try/catch structure of
 * {@link #getEncoding(BufferedInputStream)} - confirm against the original
 * sources if they are available.
 */
public class HtmlExtractor implements Extractor {
    private static final Logger logger = LoggerFactory.getLogger(HtmlExtractor.class);

    /** Number of leading bytes inspected when looking for a charset declaration. */
    private static final int ENCODING_PROBE_SIZE = 512;

    /** Default encoding; also used to decode the probed head of the stream. */
    protected String encoding = "UTF-8";

    /**
     * Pattern whose group 1 captures the charset name. The default only
     * recognises declarations of the form
     * {@code <meta ... content="...; charset=NAME">}.
     */
    protected Pattern metaCharsetPattern = Pattern.compile(
            "<meta.*content\\s*=\\s*['\"].*;\\s*charset=([\\w\\d\\-_]*)['\"]\\s*/?>",
            Pattern.CASE_INSENSITIVE | Pattern.MULTILINE);

    /** Pattern for the markup that is deleted from the decoded content. */
    protected Pattern htmlTagPattern = Pattern.compile("<[^>]+>");

    /**
     * Decodes the given HTML stream and returns its content with every match of
     * {@link #htmlTagPattern} removed.
     *
     * @param in the HTML input; must not be {@code null}
     * @param params not used by this implementation
     * @return the extracted text wrapped in an {@link ExtractData}
     * @throws RobotSystemException if {@code in} is {@code null}
     * @throws ExtractException if reading or decoding the stream fails
     */
    @Override
    public ExtractData getText(InputStream in, Map<String, String> params) {
        if (in == null) {
            throw new RobotSystemException("The inputstream is null.");
        }
        try {
            // Buffer the stream so getEncoding() can peek at the head and rewind.
            BufferedInputStream bis = new BufferedInputStream(in);
            String enc = getEncoding(bis);
            // InputStreamUtil.getBytes presumably drains the stream to its end.
            String content = new String(InputStreamUtil.getBytes(bis), enc);
            return new ExtractData(htmlTagPattern.matcher(content).replaceAll(""));
        } catch (ExtractException e) {
            // Already the exception type we report; do not wrap it a second time.
            throw e;
        } catch (Exception e) {
            throw new ExtractException(e);
        }
    }

    /**
     * Detects the encoding declared in the first {@value #ENCODING_PROBE_SIZE}
     * bytes of the stream and rewinds the stream to where it was on entry.
     *
     * <p>Any failure while probing is logged at info level and the default
     * {@link #encoding} is returned.
     *
     * @param bis the stream to probe; it is marked, read and reset
     * @return the declared charset name if {@link Charset#isSupported(String)}
     *         accepts it, otherwise {@link #encoding}
     * @throws ExtractException if the stream cannot be reset
     */
    protected String getEncoding(BufferedInputStream bis) {
        byte[] buffer = new byte[ENCODING_PROBE_SIZE];
        try {
            bis.mark(buffer.length);
            int length = readHead(bis, buffer);
            if (length > 0) {
                // The head is decoded with the default encoding just to run the regex.
                String head = new String(buffer, 0, length, encoding);
                if (!StringUtil.isBlank(head)) {
                    Matcher matcher = metaCharsetPattern.matcher(head);
                    if (matcher.find()) {
                        String declared = matcher.group(1);
                        // isSupported throws on empty or illegal names; the catch
                        // below turns that into a fallback to the default.
                        if (Charset.isSupported(declared)) {
                            return declared;
                        }
                    }
                }
            }
        } catch (Exception e) {
            if (logger.isInfoEnabled()) {
                logger.info("Use a default encoding: " + encoding, e);
            }
        } finally {
            // Always rewind so the caller reads the content from the start.
            try {
                bis.reset();
            } catch (IOException e) {
                throw new ExtractException(e);
            }
        }
        return encoding;
    }

    /**
     * Fills {@code buffer} from {@code in}, reading repeatedly because a single
     * {@code read} call may return fewer bytes than requested.
     *
     * @return the number of bytes stored; less than {@code buffer.length} only
     *         when the stream ended first
     */
    private static int readHead(InputStream in, byte[] buffer) throws IOException {
        int total = 0;
        while (total < buffer.length) {
            int read = in.read(buffer, total, buffer.length - total);
            if (read <= 0) {
                break;
            }
            total += read;
        }
        return total;
    }

    /** @return the default encoding name */
    public String getEncoding() {
        return encoding;
    }

    /** @param encoding the default encoding name to use */
    public void setEncoding(String encoding) {
        this.encoding = encoding;
    }

    /** @return the pattern used to find the charset declaration */
    public Pattern getMetaCharsetPattern() {
        return metaCharsetPattern;
    }

    /** @param metaCharsetPattern pattern whose group 1 captures the charset name */
    public void setMetaCharsetPattern(Pattern metaCharsetPattern) {
        this.metaCharsetPattern = metaCharsetPattern;
    }

    /** @return the pattern for markup that is deleted from the content */
    public Pattern getHtmlTagPattern() {
        return htmlTagPattern;
    }

    /** @param htmlTagPattern pattern for markup to delete from the content */
    public void setHtmlTagPattern(Pattern htmlTagPattern) {
        this.htmlTagPattern = htmlTagPattern;
    }
}

