package jp.ac.dendai.cdl.mori.wikie.mapred;

import java.io.*;
import java.util.*;
import java.util.regex.*;

import jp.ac.dendai.cdl.mori.wikie.main.*;
import jp.ac.dendai.cdl.mori.wikie.parser.*;
import jp.ac.dendai.cdl.mori.wikie.util.*;

import org.apache.hadoop.io.*;
import org.apache.hadoop.mapred.*;
/**
 * Mapper that extracts ISBN codes from page text.
 * <p>
 * For each input record it emits one (page id, 13-digit ISBN) pair per ISBN
 * found in the page text. 10-digit codes are converted to the 13-digit form
 * using the configured header (prefix).
 *
 * @author Mori
 */
public class ISBNMapper extends WMapper {
    /** Header used when the job configuration does not supply one. */
    private static final String DEFAULT_ISBN_HEADER = "978";

    /**
     * Matches "ISBN", optional spaces, a run of digits/hyphens (group 1) and
     * an optional trailing check character 'X' (group 2).
     */
    private static final Pattern ISBN_PATTERN = Pattern.compile("ISBN *([0-9-]+)([Xx]?)");

    /** Header prepended when converting 10-digit codes; set in {@link #configure(JobConf)}. */
    private String isbnHeader = DEFAULT_ISBN_HEADER;

    /**
     * Reads the ISBN header from the job configuration
     * ({@code WikIE.PROP_ISBN_HEADER}), falling back to "978".
     *
     * @param job job configuration
     */
    @Override
    public void configure(JobConf job) {
        super.configure(job);
        isbnHeader = job.get(WikIE.PROP_ISBN_HEADER, DEFAULT_ISBN_HEADER);
    }

    /**
     * Extracts the ISBN codes from one page and emits (page id, ISBN-13) pairs.
     * <p>
     * Failures while building the page handler or extracting codes are treated
     * as best-effort: the stack trace is printed and the record is skipped.
     * Failures while writing output are NOT swallowed; they propagate as
     * {@link IOException} so the framework can see them.
     *
     * @param key      input key (not used here)
     * @param value    input value handed to {@code createPageHandler}
     * @param output   collector receiving (page id, ISBN-13) pairs
     * @param reporter progress reporter (not used here)
     * @throws IOException if the collector fails to accept a pair
     */
    @Override
    public void map(LongWritable key, Text value, OutputCollector<Text, Text> output, Reporter reporter) throws IOException {
        Text outKey;
        List<String> isbnCodes;
        try {
            WPageElementHandler page = createPageHandler(value);
            String id = page.getId();
            String text = WNormalizer.deleteNonPrintingChar(page.getText());
            isbnCodes = getISBNCode(text, isbnHeader);
            if (isbnCodes.isEmpty()) {
                return;
            }
            outKey = new Text(id);
        }
        catch (Exception e) {
            // Best-effort extraction: a page that cannot be processed is skipped.
            e.printStackTrace();
            return;
        }
        // Collected outside the try block so that output errors are not hidden.
        for (String isbnCode : isbnCodes) {
            output.collect(outKey, new Text(isbnCode));
        }
    }

    /**
     * Extracts ISBN codes from a string.
     * <p>
     * Each occurrence of "ISBN" followed by optional spaces and a run of
     * digits/hyphens is examined. Hyphens are removed; a 10-character code
     * (10 digits, or 9 digits followed by the check character 'X'/'x') is
     * converted with {@link #convertISBN10To13(String, String)}. Only results
     * that are exactly 13 characters long are returned.
     *
     * @param text   text to scan; {@code null} yields an empty list
     * @param header header (prefix) used when converting 10-digit codes
     * @return the 13-character codes in order of appearance, never {@code null}
     */
    public static List<String> getISBNCode(String text, String header) {
        List<String> result = new ArrayList<String>();
        if (text == null) {
            return result;
        }
        Matcher matcher = ISBN_PATTERN.matcher(text);
        while (matcher.find()) {
            String code = matcher.group(1).replace("-", "");
            // A trailing 'X' is only meaningful as the check character of an
            // ISBN-10, i.e. when exactly nine digits precede it.
            if (code.length() == 9 && matcher.group(2).length() > 0) {
                code = code + "X";
            }
            if (code.length() == 10) {
                code = convertISBN10To13(code, header);
            }
            if (code.length() == 13) {
                result.add(code);
            }
        }
        return result;
    }

    /**
     * Converts an old-style 10-character code to the 13-digit form.
     * <p>
     * The last character of {@code code10} (its check character) is dropped,
     * {@code header} is prepended, and a new check digit is appended. The
     * check digit is computed by weighting the digits alternately by 1 and 3
     * (starting with 1) and taking {@code (10 - sum % 10) % 10}.
     *
     * @param code10 10-character code; its last character is ignored
     * @param header leading digits of the new code, normally "978"
     * @return header + first nine characters of {@code code10} + check digit
     * @throws NumberFormatException if the header or the retained part of
     *         {@code code10} contains a non-digit character
     */
    public static String convertISBN10To13(String code10, String header) {
        StringBuilder code13 = new StringBuilder(header + code10.substring(0, code10.length() - 1));
        int sum = 0;
        for (int i = 0; i < code13.length(); i++) {
            char c = code13.charAt(i);
            int n = Character.digit(c, 10);
            if (n < 0) {
                throw new NumberFormatException("For input string: \"" + c + "\"");
            }
            sum += (i % 2 == 0) ? n : n * 3;
        }
        // The outer modulo maps a computed value of 10 to the digit 0.
        code13.append((10 - sum % 10) % 10);
        return code13.toString();
    }
}
