package jp.ac.dendai.cdl.mori.wikie.io;

import java.io.*;
import java.util.regex.Pattern;

import jp.ac.dendai.cdl.mori.wikie.main.*;

import org.apache.hadoop.conf.*;
import org.apache.hadoop.fs.*;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.io.*;
import org.apache.hadoop.io.compress.*;
import org.apache.hadoop.mapred.*;

/**
 * {@link RecordReader} that cuts an XML file into records, one per configured element.
 *
 * <p>The input is scanned line by line. Each line is trimmed, and a trimmed line
 * that matches {@code <startTag>} opens a record while a trimmed line that matches
 * {@code </endTag>} closes it. The tag names are read from the job configuration
 * ({@code Driver.PROP_START_TAG} / {@code Driver.PROP_END_TAG}) and are interpreted
 * as regular expressions. The emitted value is the concatenation of the trimmed
 * lines of the record, with no separator between lines.
 *
 * @author Mori
 */
public class XMLRecordReader implements RecordReader<LongWritable, Text> {
    /**
     * Offset at which this reader starts reading.
     */
    private long start;
    /**
     * Current offset.
     */
    private long pos;
    /**
     * Offset at which this reader stops starting new records.
     */
    private long end;
    /**
     * Line reader that delivers the text one line at a time.
     */
    private LineReader in;
    /**
     * Matches a (trimmed) line consisting of the opening tag of a record.
     */
    private final Pattern startPattern;
    /**
     * Matches a (trimmed) line consisting of the closing tag of a record.
     */
    private final Pattern endPattern;

    /**
     * Opens the file behind {@code split} and positions the reader at the split start.
     *
     * @param job   job configuration; supplies the start/end tag names and the
     *              compression codec lookup
     * @param split the part of the file this reader is responsible for
     * @throws IOException if the file cannot be opened, seeked or read
     * @throws java.util.regex.PatternSyntaxException if a configured tag does not
     *         form a valid regular expression
     */
    public XMLRecordReader(Configuration job, FileSplit split) throws IOException {
        // Compile once, and before any stream is opened, so that a malformed tag
        // cannot leave an open file behind and next() does not recompile per line.
        startPattern = Pattern.compile("<" + job.get(Driver.PROP_START_TAG) + ">");
        endPattern = Pattern.compile("</" + job.get(Driver.PROP_END_TAG) + ">");

        start = split.getStart();
        end = start + split.getLength();
        final Path file = split.getPath();
        final CompressionCodec codec = new CompressionCodecFactory(job).getCodec(file);

        // open the file and seek to the start of the split
        FileSystem fs = file.getFileSystem(job);
        FSDataInputStream fileIn = fs.open(file);
        boolean initialized = false;
        try {
            boolean skipFirstLine = false;
            if (codec != null) {
                in = new LineReader(codec.createInputStream(fileIn), job);
                // With a codec the "pos >= end" check in next() must never fire;
                // reading continues until the stream is exhausted.
                end = Long.MAX_VALUE;
            } else {
                if (start != 0) {
                    // Back up one byte and discard through the end of that line.
                    // NOTE(review): presumably the reader of the preceding split
                    // consumes that line - confirm against the InputFormat.
                    skipFirstLine = true;
                    --start;
                    fileIn.seek(start);
                }
                in = new LineReader(fileIn, job);
            }
            if (skipFirstLine) {  // skip first line and re-establish "start".
                start += in.readLine(new Text());
            }
            pos = start;
            initialized = true;
        } finally {
            if (!initialized) {
                // Construction failed: release the file handle opened above.
                try {
                    fileIn.close();
                } catch (IOException ignored) {
                    // The failure that aborted construction is the one to report.
                }
            }
        }
    }

    /**
     * Extracts the next element.
     *
     * <p>Lines are consumed until a line matching the end tag is read. Lines seen
     * before the first start-tag line are skipped, so if an end tag is reached
     * without a preceding start tag, {@code value} is set to the empty string and
     * {@code true} is still returned. The return value therefore tells whether
     * reading can continue, not whether a target element was found.
     *
     * @param key   set to the offset at which scanning for this record began
     * @param value set to the concatenated, trimmed lines of the element
     * @return {@code true} if an end-tag line was read; {@code false} if the current
     *         offset is at or past the end offset, or the input ran out before an
     *         end tag was seen (in which case {@code value} is left untouched)
     * @throws IOException if reading a line fails
     */
    @Override
    public synchronized boolean next(LongWritable key, Text value)
    throws IOException {
        if (pos >= end) {
            return false;
        }

        key.set(pos);
        Text lineBuffer = new Text();
        StringBuilder page = new StringBuilder();
        boolean withinTarget = false;
        while (true) {
            int consumed = in.readLine(lineBuffer);
            if (consumed <= 0) {
                // Input exhausted before an end tag: no (further) record.
                return false;
            }
            pos += consumed;
            String line = lineBuffer.toString().trim();
            boolean isStart = startPattern.matcher(line).matches();
            // Append each line at most once, including a start-tag line that
            // shows up while a record is already open.
            if (withinTarget || isStart) {
                page.append(line);
            }
            if (isStart) {
                withinTarget = true;
            } else if (endPattern.matcher(line).matches()) {
                value.set(page.toString());
                return true;
            }
        }
    }

    /**
     * @return a new, empty key object
     */
    @Override
    public LongWritable createKey() {
        return new LongWritable();
    }

    /**
     * @return a new, empty value object
     */
    @Override
    public Text createValue() {
        return new Text();
    }

    /**
     * Closes the underlying line reader, if one was created.
     *
     * @throws IOException if closing the reader fails
     */
    @Override
    public void close() throws IOException {
        if (in != null) {
            in.close();
        }
    }

    /**
     * @return the current offset
     */
    @Override
    public long getPos() throws IOException {
        return pos;
    }

    /**
     * @return the fraction of the range {@code start..end} already consumed, capped
     *         at 1.0; 0.0 when the range is empty
     */
    @Override
    public float getProgress() throws IOException {
        if (start == end) {
            return 0.0f;
        } else {
            return Math.min(1.0f, (pos - start) / (float)(end - start));
        }
    }
}
